diff --git "a/vision/trainer_state.json" "b/vision/trainer_state.json" new file mode 100644--- /dev/null +++ "b/vision/trainer_state.json" @@ -0,0 +1,61089 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8721, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011466574934067194, + "grad_norm": 32.98546172029393, + "learning_rate": 3.816793893129771e-06, + "loss": 7.9599, + "step": 1 + }, + { + "epoch": 0.00022933149868134388, + "grad_norm": 47.732657241119774, + "learning_rate": 7.633587786259541e-06, + "loss": 7.8554, + "step": 2 + }, + { + "epoch": 0.0003439972480220158, + "grad_norm": 48.40557058105882, + "learning_rate": 1.1450381679389314e-05, + "loss": 7.7295, + "step": 3 + }, + { + "epoch": 0.00045866299736268775, + "grad_norm": 43.083244980355815, + "learning_rate": 1.5267175572519083e-05, + "loss": 7.5884, + "step": 4 + }, + { + "epoch": 0.0005733287467033597, + "grad_norm": 29.80504046923896, + "learning_rate": 1.9083969465648855e-05, + "loss": 7.1929, + "step": 5 + }, + { + "epoch": 0.0006879944960440316, + "grad_norm": 24.237601646564773, + "learning_rate": 2.2900763358778628e-05, + "loss": 6.4955, + "step": 6 + }, + { + "epoch": 0.0008026602453847036, + "grad_norm": 16.16051331323853, + "learning_rate": 2.6717557251908397e-05, + "loss": 6.1098, + "step": 7 + }, + { + "epoch": 0.0009173259947253755, + "grad_norm": 11.704852681919737, + "learning_rate": 3.0534351145038166e-05, + "loss": 5.8676, + "step": 8 + }, + { + "epoch": 0.0010319917440660474, + "grad_norm": 11.478570942670492, + "learning_rate": 3.435114503816794e-05, + "loss": 5.781, + "step": 9 + }, + { + "epoch": 0.0011466574934067195, + "grad_norm": 8.826203088380227, + "learning_rate": 3.816793893129771e-05, + "loss": 5.5798, + "step": 10 + }, + { + "epoch": 0.0012613232427473914, + "grad_norm": 8.01492782521436, + "learning_rate": 4.198473282442748e-05, + "loss": 5.156, + "step": 11 + }, + { + "epoch": 0.0013759889920880633, + "grad_norm": 8.856027556816803, + "learning_rate": 4.5801526717557256e-05, + "loss": 5.1825, + "step": 12 + }, + { + "epoch": 0.0014906547414287351, + "grad_norm": 8.764564552477168, + "learning_rate": 4.9618320610687025e-05, + "loss": 5.1275, + "step": 13 + }, + { + "epoch": 0.0016053204907694072, + "grad_norm": 8.050779163407102, + "learning_rate": 5.3435114503816794e-05, + "loss": 4.7772, + "step": 14 + }, + { + "epoch": 0.0017199862401100791, + "grad_norm": 7.677818541977624, + "learning_rate": 5.725190839694656e-05, + "loss": 4.7878, + "step": 15 + }, + { + "epoch": 0.001834651989450751, + "grad_norm": 6.2923619762944245, + "learning_rate": 6.106870229007633e-05, + "loss": 4.4865, + "step": 16 + }, + { + "epoch": 0.001949317738791423, + "grad_norm": 6.603642188881294, + "learning_rate": 6.488549618320611e-05, + "loss": 4.4195, + "step": 17 + }, + { + "epoch": 0.0020639834881320948, + "grad_norm": 5.276561341493895, + "learning_rate": 6.870229007633588e-05, + "loss": 4.3025, + "step": 18 + }, + { + "epoch": 0.002178649237472767, + "grad_norm": 6.574422805087178, + "learning_rate": 7.251908396946565e-05, + "loss": 4.3309, + "step": 19 + }, + { + "epoch": 0.002293314986813439, + "grad_norm": 6.716500046893577, + "learning_rate": 7.633587786259542e-05, + "loss": 4.1763, + "step": 20 + }, + { + "epoch": 0.002407980736154111, + "grad_norm": 7.767168694443845, + "learning_rate": 8.015267175572518e-05, + "loss": 4.1277, + "step": 21 + }, + { + "epoch": 0.0025226464854947827, + "grad_norm": 5.43495664457045, + "learning_rate": 8.396946564885496e-05, + "loss": 4.1637, + "step": 22 + }, + { + "epoch": 0.0026373122348354546, + "grad_norm": 8.19870905005088, + "learning_rate": 8.778625954198472e-05, + "loss": 4.1303, + "step": 23 + }, + { + "epoch": 0.0027519779841761265, + "grad_norm": 4.390958512330856, + "learning_rate": 9.160305343511451e-05, + "loss": 4.0368, + "step": 24 + }, + { + "epoch": 0.0028666437335167984, + "grad_norm": 8.181788688034274, + "learning_rate": 9.541984732824429e-05, + "loss": 4.0893, + "step": 25 + }, + { + "epoch": 0.0029813094828574703, + "grad_norm": 5.161047114532387, + "learning_rate": 9.923664122137405e-05, + "loss": 4.01, + "step": 26 + }, + { + "epoch": 0.0030959752321981426, + "grad_norm": 3.988291754034103, + "learning_rate": 0.00010305343511450383, + "loss": 3.9656, + "step": 27 + }, + { + "epoch": 0.0032106409815388145, + "grad_norm": 4.3518375185231, + "learning_rate": 0.00010687022900763359, + "loss": 3.8979, + "step": 28 + }, + { + "epoch": 0.0033253067308794864, + "grad_norm": 3.2474450311419583, + "learning_rate": 0.00011068702290076336, + "loss": 3.8065, + "step": 29 + }, + { + "epoch": 0.0034399724802201583, + "grad_norm": 3.6981046262219808, + "learning_rate": 0.00011450381679389313, + "loss": 3.8242, + "step": 30 + }, + { + "epoch": 0.00355463822956083, + "grad_norm": 2.3400054887498256, + "learning_rate": 0.0001183206106870229, + "loss": 3.7846, + "step": 31 + }, + { + "epoch": 0.003669303978901502, + "grad_norm": 2.5387815860304186, + "learning_rate": 0.00012213740458015266, + "loss": 3.7654, + "step": 32 + }, + { + "epoch": 0.003783969728242174, + "grad_norm": 2.024649068663064, + "learning_rate": 0.00012595419847328244, + "loss": 3.7131, + "step": 33 + }, + { + "epoch": 0.003898635477582846, + "grad_norm": 2.6049932950681742, + "learning_rate": 0.00012977099236641222, + "loss": 3.6599, + "step": 34 + }, + { + "epoch": 0.004013301226923518, + "grad_norm": 2.203460859949766, + "learning_rate": 0.000133587786259542, + "loss": 3.5835, + "step": 35 + }, + { + "epoch": 0.0041279669762641896, + "grad_norm": 1.663371476726465, + "learning_rate": 0.00013740458015267177, + "loss": 3.6209, + "step": 36 + }, + { + "epoch": 0.0042426327256048614, + "grad_norm": 2.0835656215070917, + "learning_rate": 0.00014122137404580154, + "loss": 3.3668, + "step": 37 + }, + { + "epoch": 0.004357298474945534, + "grad_norm": 1.0473334100427998, + "learning_rate": 0.0001450381679389313, + "loss": 3.5235, + "step": 38 + }, + { + "epoch": 0.004471964224286206, + "grad_norm": 1.2127375462111283, + "learning_rate": 0.00014885496183206107, + "loss": 3.4061, + "step": 39 + }, + { + "epoch": 0.004586629973626878, + "grad_norm": 1.3399602732950553, + "learning_rate": 0.00015267175572519084, + "loss": 3.556, + "step": 40 + }, + { + "epoch": 0.00470129572296755, + "grad_norm": 1.4440908550924991, + "learning_rate": 0.00015648854961832062, + "loss": 3.3867, + "step": 41 + }, + { + "epoch": 0.004815961472308222, + "grad_norm": 0.9843049187136491, + "learning_rate": 0.00016030534351145037, + "loss": 3.5907, + "step": 42 + }, + { + "epoch": 0.004930627221648894, + "grad_norm": 1.0985154001302673, + "learning_rate": 0.00016412213740458014, + "loss": 3.5133, + "step": 43 + }, + { + "epoch": 0.0050452929709895655, + "grad_norm": 0.7753963449528569, + "learning_rate": 0.00016793893129770992, + "loss": 3.473, + "step": 44 + }, + { + "epoch": 0.005159958720330237, + "grad_norm": 0.9349357778900093, + "learning_rate": 0.0001717557251908397, + "loss": 3.5091, + "step": 45 + }, + { + "epoch": 0.005274624469670909, + "grad_norm": 1.0655705240360924, + "learning_rate": 0.00017557251908396944, + "loss": 3.4788, + "step": 46 + }, + { + "epoch": 0.005389290219011581, + "grad_norm": 1.0830224408202829, + "learning_rate": 0.00017938931297709925, + "loss": 3.3443, + "step": 47 + }, + { + "epoch": 0.005503955968352253, + "grad_norm": 0.9610715057250465, + "learning_rate": 0.00018320610687022902, + "loss": 3.4869, + "step": 48 + }, + { + "epoch": 0.005618621717692925, + "grad_norm": 1.6460839008029915, + "learning_rate": 0.0001870229007633588, + "loss": 3.5075, + "step": 49 + }, + { + "epoch": 0.005733287467033597, + "grad_norm": 0.9502092366035757, + "learning_rate": 0.00019083969465648857, + "loss": 3.5921, + "step": 50 + }, + { + "epoch": 0.005847953216374269, + "grad_norm": 1.3135765411097595, + "learning_rate": 0.00019465648854961832, + "loss": 3.5413, + "step": 51 + }, + { + "epoch": 0.0059626189657149406, + "grad_norm": 0.8678551303184648, + "learning_rate": 0.0001984732824427481, + "loss": 3.4618, + "step": 52 + }, + { + "epoch": 0.006077284715055613, + "grad_norm": 0.7809299901239882, + "learning_rate": 0.00020229007633587788, + "loss": 3.5253, + "step": 53 + }, + { + "epoch": 0.006191950464396285, + "grad_norm": 1.0012899217352809, + "learning_rate": 0.00020610687022900765, + "loss": 3.4477, + "step": 54 + }, + { + "epoch": 0.006306616213736957, + "grad_norm": 0.8581363732007035, + "learning_rate": 0.0002099236641221374, + "loss": 3.4745, + "step": 55 + }, + { + "epoch": 0.006421281963077629, + "grad_norm": 0.8298370122931034, + "learning_rate": 0.00021374045801526718, + "loss": 3.42, + "step": 56 + }, + { + "epoch": 0.006535947712418301, + "grad_norm": 1.0185658213061561, + "learning_rate": 0.00021755725190839695, + "loss": 3.4566, + "step": 57 + }, + { + "epoch": 0.006650613461758973, + "grad_norm": 0.865321806958494, + "learning_rate": 0.00022137404580152673, + "loss": 3.4102, + "step": 58 + }, + { + "epoch": 0.006765279211099645, + "grad_norm": 0.941724920284073, + "learning_rate": 0.00022519083969465648, + "loss": 3.4892, + "step": 59 + }, + { + "epoch": 0.0068799449604403165, + "grad_norm": 1.0459210963236445, + "learning_rate": 0.00022900763358778625, + "loss": 3.4095, + "step": 60 + }, + { + "epoch": 0.006994610709780988, + "grad_norm": 1.1554245090240505, + "learning_rate": 0.00023282442748091603, + "loss": 3.3631, + "step": 61 + }, + { + "epoch": 0.00710927645912166, + "grad_norm": 0.895034685115234, + "learning_rate": 0.0002366412213740458, + "loss": 3.4473, + "step": 62 + }, + { + "epoch": 0.007223942208462332, + "grad_norm": 1.3066154220775512, + "learning_rate": 0.00024045801526717558, + "loss": 3.3322, + "step": 63 + }, + { + "epoch": 0.007338607957803004, + "grad_norm": 0.9813587018615572, + "learning_rate": 0.00024427480916030533, + "loss": 3.2766, + "step": 64 + }, + { + "epoch": 0.007453273707143676, + "grad_norm": 0.9704606262126225, + "learning_rate": 0.00024809160305343513, + "loss": 3.484, + "step": 65 + }, + { + "epoch": 0.007567939456484348, + "grad_norm": 0.9712869567388323, + "learning_rate": 0.0002519083969465649, + "loss": 3.4628, + "step": 66 + }, + { + "epoch": 0.00768260520582502, + "grad_norm": 0.8400786558141455, + "learning_rate": 0.00025572519083969463, + "loss": 3.4789, + "step": 67 + }, + { + "epoch": 0.007797270955165692, + "grad_norm": 1.0253271071209522, + "learning_rate": 0.00025954198473282443, + "loss": 3.4881, + "step": 68 + }, + { + "epoch": 0.007911936704506364, + "grad_norm": 1.107476856533126, + "learning_rate": 0.0002633587786259542, + "loss": 3.4161, + "step": 69 + }, + { + "epoch": 0.008026602453847035, + "grad_norm": 0.9325120243579201, + "learning_rate": 0.000267175572519084, + "loss": 3.2294, + "step": 70 + }, + { + "epoch": 0.008141268203187708, + "grad_norm": 1.3497439034406296, + "learning_rate": 0.00027099236641221373, + "loss": 3.4448, + "step": 71 + }, + { + "epoch": 0.008255933952528379, + "grad_norm": 0.8744296526952612, + "learning_rate": 0.00027480916030534353, + "loss": 3.4431, + "step": 72 + }, + { + "epoch": 0.008370599701869052, + "grad_norm": 0.9427355023191845, + "learning_rate": 0.0002786259541984733, + "loss": 3.3507, + "step": 73 + }, + { + "epoch": 0.008485265451209723, + "grad_norm": 1.0251166266851959, + "learning_rate": 0.0002824427480916031, + "loss": 3.4058, + "step": 74 + }, + { + "epoch": 0.008599931200550396, + "grad_norm": 0.7674006906281569, + "learning_rate": 0.0002862595419847328, + "loss": 3.3602, + "step": 75 + }, + { + "epoch": 0.008714596949891068, + "grad_norm": 0.9940608958833785, + "learning_rate": 0.0002900763358778626, + "loss": 3.255, + "step": 76 + }, + { + "epoch": 0.00882926269923174, + "grad_norm": 1.0217336276408653, + "learning_rate": 0.0002938931297709924, + "loss": 3.4686, + "step": 77 + }, + { + "epoch": 0.008943928448572412, + "grad_norm": 1.0210453341651522, + "learning_rate": 0.00029770992366412214, + "loss": 3.1809, + "step": 78 + }, + { + "epoch": 0.009058594197913083, + "grad_norm": 0.8433005544816973, + "learning_rate": 0.00030152671755725194, + "loss": 3.1226, + "step": 79 + }, + { + "epoch": 0.009173259947253756, + "grad_norm": 0.8872726347710475, + "learning_rate": 0.0003053435114503817, + "loss": 3.2751, + "step": 80 + }, + { + "epoch": 0.009287925696594427, + "grad_norm": 0.8457382669147994, + "learning_rate": 0.0003091603053435115, + "loss": 3.2558, + "step": 81 + }, + { + "epoch": 0.0094025914459351, + "grad_norm": 1.1123453200303604, + "learning_rate": 0.00031297709923664124, + "loss": 3.3066, + "step": 82 + }, + { + "epoch": 0.00951725719527577, + "grad_norm": 1.009852884299192, + "learning_rate": 0.000316793893129771, + "loss": 3.3075, + "step": 83 + }, + { + "epoch": 0.009631922944616443, + "grad_norm": 1.2141362918922074, + "learning_rate": 0.00032061068702290074, + "loss": 3.3221, + "step": 84 + }, + { + "epoch": 0.009746588693957114, + "grad_norm": 0.7220903566591271, + "learning_rate": 0.00032442748091603054, + "loss": 3.194, + "step": 85 + }, + { + "epoch": 0.009861254443297787, + "grad_norm": 0.9271719750341705, + "learning_rate": 0.0003282442748091603, + "loss": 3.3259, + "step": 86 + }, + { + "epoch": 0.009975920192638458, + "grad_norm": 0.7515205226796537, + "learning_rate": 0.0003320610687022901, + "loss": 3.2414, + "step": 87 + }, + { + "epoch": 0.010090585941979131, + "grad_norm": 0.8176808987862619, + "learning_rate": 0.00033587786259541984, + "loss": 3.1264, + "step": 88 + }, + { + "epoch": 0.010205251691319802, + "grad_norm": 1.1649501394725053, + "learning_rate": 0.00033969465648854964, + "loss": 3.1802, + "step": 89 + }, + { + "epoch": 0.010319917440660475, + "grad_norm": 1.3541676871463835, + "learning_rate": 0.0003435114503816794, + "loss": 3.2157, + "step": 90 + }, + { + "epoch": 0.010434583190001148, + "grad_norm": 0.8926327463688286, + "learning_rate": 0.0003473282442748092, + "loss": 3.2253, + "step": 91 + }, + { + "epoch": 0.010549248939341819, + "grad_norm": 0.9650209887320587, + "learning_rate": 0.0003511450381679389, + "loss": 3.0817, + "step": 92 + }, + { + "epoch": 0.010663914688682491, + "grad_norm": 0.9103638643319637, + "learning_rate": 0.0003549618320610687, + "loss": 3.2968, + "step": 93 + }, + { + "epoch": 0.010778580438023162, + "grad_norm": 1.193354297338719, + "learning_rate": 0.0003587786259541985, + "loss": 3.2887, + "step": 94 + }, + { + "epoch": 0.010893246187363835, + "grad_norm": 0.9156426112986605, + "learning_rate": 0.00036259541984732824, + "loss": 3.2744, + "step": 95 + }, + { + "epoch": 0.011007911936704506, + "grad_norm": 1.457209283920772, + "learning_rate": 0.00036641221374045805, + "loss": 3.3551, + "step": 96 + }, + { + "epoch": 0.011122577686045179, + "grad_norm": 0.9923668985964779, + "learning_rate": 0.0003702290076335878, + "loss": 3.2413, + "step": 97 + }, + { + "epoch": 0.01123724343538585, + "grad_norm": 0.8705641434360311, + "learning_rate": 0.0003740458015267176, + "loss": 3.227, + "step": 98 + }, + { + "epoch": 0.011351909184726523, + "grad_norm": 0.7668465888834765, + "learning_rate": 0.00037786259541984735, + "loss": 3.1211, + "step": 99 + }, + { + "epoch": 0.011466574934067194, + "grad_norm": 0.8800734116750144, + "learning_rate": 0.00038167938931297715, + "loss": 3.1365, + "step": 100 + }, + { + "epoch": 0.011581240683407866, + "grad_norm": 0.8257318115489123, + "learning_rate": 0.00038549618320610684, + "loss": 3.1621, + "step": 101 + }, + { + "epoch": 0.011695906432748537, + "grad_norm": 0.8350675911404676, + "learning_rate": 0.00038931297709923665, + "loss": 3.096, + "step": 102 + }, + { + "epoch": 0.01181057218208921, + "grad_norm": 0.9917559889337618, + "learning_rate": 0.0003931297709923664, + "loss": 3.1627, + "step": 103 + }, + { + "epoch": 0.011925237931429881, + "grad_norm": 0.8728526561487229, + "learning_rate": 0.0003969465648854962, + "loss": 3.1434, + "step": 104 + }, + { + "epoch": 0.012039903680770554, + "grad_norm": 0.8044355327218607, + "learning_rate": 0.00040076335877862595, + "loss": 3.3545, + "step": 105 + }, + { + "epoch": 0.012154569430111227, + "grad_norm": 0.7624150853537827, + "learning_rate": 0.00040458015267175575, + "loss": 3.2477, + "step": 106 + }, + { + "epoch": 0.012269235179451898, + "grad_norm": 0.7685371880323714, + "learning_rate": 0.0004083969465648855, + "loss": 3.302, + "step": 107 + }, + { + "epoch": 0.01238390092879257, + "grad_norm": 0.7944341844978677, + "learning_rate": 0.0004122137404580153, + "loss": 3.1708, + "step": 108 + }, + { + "epoch": 0.012498566678133241, + "grad_norm": 0.9876391458592928, + "learning_rate": 0.00041603053435114505, + "loss": 3.2425, + "step": 109 + }, + { + "epoch": 0.012613232427473914, + "grad_norm": 0.8847659287810202, + "learning_rate": 0.0004198473282442748, + "loss": 3.2351, + "step": 110 + }, + { + "epoch": 0.012727898176814585, + "grad_norm": 0.8426928506995099, + "learning_rate": 0.00042366412213740455, + "loss": 3.1772, + "step": 111 + }, + { + "epoch": 0.012842563926155258, + "grad_norm": 0.922126384685991, + "learning_rate": 0.00042748091603053435, + "loss": 3.1375, + "step": 112 + }, + { + "epoch": 0.012957229675495929, + "grad_norm": 0.8080171185279135, + "learning_rate": 0.00043129770992366415, + "loss": 3.1623, + "step": 113 + }, + { + "epoch": 0.013071895424836602, + "grad_norm": 0.7030443996976394, + "learning_rate": 0.0004351145038167939, + "loss": 3.0977, + "step": 114 + }, + { + "epoch": 0.013186561174177273, + "grad_norm": 0.6535113180036304, + "learning_rate": 0.0004389312977099237, + "loss": 3.1151, + "step": 115 + }, + { + "epoch": 0.013301226923517945, + "grad_norm": 0.7180706854244188, + "learning_rate": 0.00044274809160305345, + "loss": 2.9521, + "step": 116 + }, + { + "epoch": 0.013415892672858616, + "grad_norm": 1.0076357703757337, + "learning_rate": 0.00044656488549618326, + "loss": 3.286, + "step": 117 + }, + { + "epoch": 0.01353055842219929, + "grad_norm": 0.7472021229758496, + "learning_rate": 0.00045038167938931295, + "loss": 3.1203, + "step": 118 + }, + { + "epoch": 0.01364522417153996, + "grad_norm": 0.8403325212158305, + "learning_rate": 0.00045419847328244275, + "loss": 3.2043, + "step": 119 + }, + { + "epoch": 0.013759889920880633, + "grad_norm": 0.829704170483764, + "learning_rate": 0.0004580152671755725, + "loss": 3.1752, + "step": 120 + }, + { + "epoch": 0.013874555670221306, + "grad_norm": 0.8199327720574795, + "learning_rate": 0.0004618320610687023, + "loss": 3.2066, + "step": 121 + }, + { + "epoch": 0.013989221419561977, + "grad_norm": 0.8987043231843231, + "learning_rate": 0.00046564885496183206, + "loss": 3.2588, + "step": 122 + }, + { + "epoch": 0.01410388716890265, + "grad_norm": 0.7333642395061045, + "learning_rate": 0.00046946564885496186, + "loss": 3.1435, + "step": 123 + }, + { + "epoch": 0.01421855291824332, + "grad_norm": 0.6709088599417844, + "learning_rate": 0.0004732824427480916, + "loss": 3.022, + "step": 124 + }, + { + "epoch": 0.014333218667583993, + "grad_norm": 0.7545691274043932, + "learning_rate": 0.0004770992366412214, + "loss": 3.0124, + "step": 125 + }, + { + "epoch": 0.014447884416924664, + "grad_norm": 0.7434188600478571, + "learning_rate": 0.00048091603053435116, + "loss": 3.1598, + "step": 126 + }, + { + "epoch": 0.014562550166265337, + "grad_norm": 0.781938822649928, + "learning_rate": 0.0004847328244274809, + "loss": 3.2127, + "step": 127 + }, + { + "epoch": 0.014677215915606008, + "grad_norm": 0.7091659969095595, + "learning_rate": 0.0004885496183206107, + "loss": 3.101, + "step": 128 + }, + { + "epoch": 0.01479188166494668, + "grad_norm": 0.7480721553489864, + "learning_rate": 0.0004923664122137404, + "loss": 3.1012, + "step": 129 + }, + { + "epoch": 0.014906547414287352, + "grad_norm": 0.8175518936335485, + "learning_rate": 0.0004961832061068703, + "loss": 3.1727, + "step": 130 + }, + { + "epoch": 0.015021213163628025, + "grad_norm": 0.8558047877552776, + "learning_rate": 0.0005, + "loss": 3.092, + "step": 131 + }, + { + "epoch": 0.015135878912968696, + "grad_norm": 0.715675926855538, + "learning_rate": 0.0005038167938931298, + "loss": 3.0416, + "step": 132 + }, + { + "epoch": 0.015250544662309368, + "grad_norm": 0.7894171983316872, + "learning_rate": 0.0005076335877862596, + "loss": 3.1021, + "step": 133 + }, + { + "epoch": 0.01536521041165004, + "grad_norm": 0.6352980507571451, + "learning_rate": 0.0005114503816793893, + "loss": 3.1602, + "step": 134 + }, + { + "epoch": 0.015479876160990712, + "grad_norm": 0.7282730979647982, + "learning_rate": 0.0005152671755725191, + "loss": 3.1725, + "step": 135 + }, + { + "epoch": 0.015594541910331383, + "grad_norm": 0.6393104701525223, + "learning_rate": 0.0005190839694656489, + "loss": 3.1706, + "step": 136 + }, + { + "epoch": 0.015709207659672056, + "grad_norm": 0.6448333705467408, + "learning_rate": 0.0005229007633587787, + "loss": 3.1772, + "step": 137 + }, + { + "epoch": 0.01582387340901273, + "grad_norm": 0.7367909432134404, + "learning_rate": 0.0005267175572519084, + "loss": 2.9897, + "step": 138 + }, + { + "epoch": 0.0159385391583534, + "grad_norm": 0.8198870555104838, + "learning_rate": 0.0005305343511450382, + "loss": 3.1021, + "step": 139 + }, + { + "epoch": 0.01605320490769407, + "grad_norm": 0.7357635840223945, + "learning_rate": 0.000534351145038168, + "loss": 3.0588, + "step": 140 + }, + { + "epoch": 0.016167870657034743, + "grad_norm": 0.705795404439166, + "learning_rate": 0.0005381679389312977, + "loss": 2.9621, + "step": 141 + }, + { + "epoch": 0.016282536406375416, + "grad_norm": 0.7176753874516963, + "learning_rate": 0.0005419847328244275, + "loss": 3.0721, + "step": 142 + }, + { + "epoch": 0.01639720215571609, + "grad_norm": 0.700448131519179, + "learning_rate": 0.0005458015267175572, + "loss": 2.9941, + "step": 143 + }, + { + "epoch": 0.016511867905056758, + "grad_norm": 0.6990399671593439, + "learning_rate": 0.0005496183206106871, + "loss": 3.1076, + "step": 144 + }, + { + "epoch": 0.01662653365439743, + "grad_norm": 0.7412053984828968, + "learning_rate": 0.0005534351145038168, + "loss": 3.141, + "step": 145 + }, + { + "epoch": 0.016741199403738104, + "grad_norm": 0.7233185941272354, + "learning_rate": 0.0005572519083969466, + "loss": 3.1995, + "step": 146 + }, + { + "epoch": 0.016855865153078776, + "grad_norm": 0.7077580442784301, + "learning_rate": 0.0005610687022900763, + "loss": 3.0882, + "step": 147 + }, + { + "epoch": 0.016970530902419446, + "grad_norm": 0.6612724036525008, + "learning_rate": 0.0005648854961832062, + "loss": 3.1712, + "step": 148 + }, + { + "epoch": 0.01708519665176012, + "grad_norm": 0.6123332517534898, + "learning_rate": 0.0005687022900763359, + "loss": 3.1563, + "step": 149 + }, + { + "epoch": 0.01719986240110079, + "grad_norm": 0.6606169687648724, + "learning_rate": 0.0005725190839694656, + "loss": 3.0875, + "step": 150 + }, + { + "epoch": 0.017314528150441464, + "grad_norm": 0.5871384537513337, + "learning_rate": 0.0005763358778625954, + "loss": 3.056, + "step": 151 + }, + { + "epoch": 0.017429193899782137, + "grad_norm": 0.6326403891535017, + "learning_rate": 0.0005801526717557252, + "loss": 3.0828, + "step": 152 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 0.7516813020287689, + "learning_rate": 0.000583969465648855, + "loss": 3.0481, + "step": 153 + }, + { + "epoch": 0.01765852539846348, + "grad_norm": 0.7409450053300217, + "learning_rate": 0.0005877862595419848, + "loss": 3.1067, + "step": 154 + }, + { + "epoch": 0.01777319114780415, + "grad_norm": 0.7600143894697873, + "learning_rate": 0.0005916030534351145, + "loss": 3.1402, + "step": 155 + }, + { + "epoch": 0.017887856897144824, + "grad_norm": 0.7230034388363792, + "learning_rate": 0.0005954198473282443, + "loss": 3.1705, + "step": 156 + }, + { + "epoch": 0.018002522646485494, + "grad_norm": 0.8581851881653522, + "learning_rate": 0.0005992366412213741, + "loss": 3.0555, + "step": 157 + }, + { + "epoch": 0.018117188395826166, + "grad_norm": 0.7809357567202576, + "learning_rate": 0.0006030534351145039, + "loss": 3.0556, + "step": 158 + }, + { + "epoch": 0.01823185414516684, + "grad_norm": 0.839444580176229, + "learning_rate": 0.0006068702290076335, + "loss": 3.1123, + "step": 159 + }, + { + "epoch": 0.018346519894507512, + "grad_norm": 0.7325816670271689, + "learning_rate": 0.0006106870229007634, + "loss": 3.0999, + "step": 160 + }, + { + "epoch": 0.01846118564384818, + "grad_norm": 0.6010238880161904, + "learning_rate": 0.0006145038167938931, + "loss": 3.0407, + "step": 161 + }, + { + "epoch": 0.018575851393188854, + "grad_norm": 0.7092973080447477, + "learning_rate": 0.000618320610687023, + "loss": 3.0507, + "step": 162 + }, + { + "epoch": 0.018690517142529527, + "grad_norm": 0.7168594676154711, + "learning_rate": 0.0006221374045801526, + "loss": 2.9922, + "step": 163 + }, + { + "epoch": 0.0188051828918702, + "grad_norm": 0.6018886517920051, + "learning_rate": 0.0006259541984732825, + "loss": 2.9425, + "step": 164 + }, + { + "epoch": 0.01891984864121087, + "grad_norm": 0.7357060271720588, + "learning_rate": 0.0006297709923664122, + "loss": 3.0344, + "step": 165 + }, + { + "epoch": 0.01903451439055154, + "grad_norm": 0.7157620230021696, + "learning_rate": 0.000633587786259542, + "loss": 3.1855, + "step": 166 + }, + { + "epoch": 0.019149180139892214, + "grad_norm": 0.6232405739903173, + "learning_rate": 0.0006374045801526717, + "loss": 3.0973, + "step": 167 + }, + { + "epoch": 0.019263845889232887, + "grad_norm": 0.7887082352376923, + "learning_rate": 0.0006412213740458015, + "loss": 3.1427, + "step": 168 + }, + { + "epoch": 0.01937851163857356, + "grad_norm": 0.6352665877608317, + "learning_rate": 0.0006450381679389313, + "loss": 2.9604, + "step": 169 + }, + { + "epoch": 0.01949317738791423, + "grad_norm": 0.6228776886405907, + "learning_rate": 0.0006488549618320611, + "loss": 3.0234, + "step": 170 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.7341180388377916, + "learning_rate": 0.0006526717557251909, + "loss": 3.1532, + "step": 171 + }, + { + "epoch": 0.019722508886595574, + "grad_norm": 0.5460205774511234, + "learning_rate": 0.0006564885496183206, + "loss": 2.9746, + "step": 172 + }, + { + "epoch": 0.019837174635936247, + "grad_norm": 0.5653175240630341, + "learning_rate": 0.0006603053435114504, + "loss": 2.9971, + "step": 173 + }, + { + "epoch": 0.019951840385276916, + "grad_norm": 0.7731864970577471, + "learning_rate": 0.0006641221374045802, + "loss": 2.9651, + "step": 174 + }, + { + "epoch": 0.02006650613461759, + "grad_norm": 0.6771364419122414, + "learning_rate": 0.0006679389312977099, + "loss": 2.8333, + "step": 175 + }, + { + "epoch": 0.020181171883958262, + "grad_norm": 0.6346129832522949, + "learning_rate": 0.0006717557251908397, + "loss": 3.054, + "step": 176 + }, + { + "epoch": 0.020295837633298935, + "grad_norm": 0.7326932171856574, + "learning_rate": 0.0006755725190839694, + "loss": 3.0687, + "step": 177 + }, + { + "epoch": 0.020410503382639604, + "grad_norm": 0.5823604135128475, + "learning_rate": 0.0006793893129770993, + "loss": 2.8745, + "step": 178 + }, + { + "epoch": 0.020525169131980277, + "grad_norm": 0.6174195309858355, + "learning_rate": 0.000683206106870229, + "loss": 3.0312, + "step": 179 + }, + { + "epoch": 0.02063983488132095, + "grad_norm": 0.5418158380203857, + "learning_rate": 0.0006870229007633588, + "loss": 2.8996, + "step": 180 + }, + { + "epoch": 0.020754500630661622, + "grad_norm": 0.6181540511481147, + "learning_rate": 0.0006908396946564885, + "loss": 2.9636, + "step": 181 + }, + { + "epoch": 0.020869166380002295, + "grad_norm": 0.6244568919555246, + "learning_rate": 0.0006946564885496184, + "loss": 3.084, + "step": 182 + }, + { + "epoch": 0.020983832129342964, + "grad_norm": 0.7149755208774589, + "learning_rate": 0.0006984732824427481, + "loss": 3.0251, + "step": 183 + }, + { + "epoch": 0.021098497878683637, + "grad_norm": 0.6056458595252673, + "learning_rate": 0.0007022900763358778, + "loss": 3.0513, + "step": 184 + }, + { + "epoch": 0.02121316362802431, + "grad_norm": 0.6214029944478348, + "learning_rate": 0.0007061068702290076, + "loss": 2.9824, + "step": 185 + }, + { + "epoch": 0.021327829377364983, + "grad_norm": 0.641553641710498, + "learning_rate": 0.0007099236641221374, + "loss": 3.056, + "step": 186 + }, + { + "epoch": 0.021442495126705652, + "grad_norm": 0.6667627792634219, + "learning_rate": 0.0007137404580152672, + "loss": 3.0012, + "step": 187 + }, + { + "epoch": 0.021557160876046325, + "grad_norm": 0.5945142736105938, + "learning_rate": 0.000717557251908397, + "loss": 3.1214, + "step": 188 + }, + { + "epoch": 0.021671826625386997, + "grad_norm": 0.6783251089331055, + "learning_rate": 0.0007213740458015267, + "loss": 2.999, + "step": 189 + }, + { + "epoch": 0.02178649237472767, + "grad_norm": 0.6688221997283305, + "learning_rate": 0.0007251908396946565, + "loss": 3.0808, + "step": 190 + }, + { + "epoch": 0.02190115812406834, + "grad_norm": 0.6746245827654979, + "learning_rate": 0.0007290076335877863, + "loss": 3.0173, + "step": 191 + }, + { + "epoch": 0.022015823873409012, + "grad_norm": 0.5795337065907388, + "learning_rate": 0.0007328244274809161, + "loss": 2.9558, + "step": 192 + }, + { + "epoch": 0.022130489622749685, + "grad_norm": 0.6499209328375691, + "learning_rate": 0.0007366412213740457, + "loss": 3.0574, + "step": 193 + }, + { + "epoch": 0.022245155372090358, + "grad_norm": 0.6980438906576684, + "learning_rate": 0.0007404580152671756, + "loss": 2.8779, + "step": 194 + }, + { + "epoch": 0.022359821121431027, + "grad_norm": 0.5720807575262353, + "learning_rate": 0.0007442748091603053, + "loss": 3.0448, + "step": 195 + }, + { + "epoch": 0.0224744868707717, + "grad_norm": 0.5301946862336206, + "learning_rate": 0.0007480916030534352, + "loss": 3.1149, + "step": 196 + }, + { + "epoch": 0.022589152620112372, + "grad_norm": 0.8105858231356275, + "learning_rate": 0.0007519083969465648, + "loss": 3.0755, + "step": 197 + }, + { + "epoch": 0.022703818369453045, + "grad_norm": 0.5462211661048078, + "learning_rate": 0.0007557251908396947, + "loss": 3.0021, + "step": 198 + }, + { + "epoch": 0.022818484118793718, + "grad_norm": 4.3269402910987225, + "learning_rate": 0.0007595419847328244, + "loss": 3.1649, + "step": 199 + }, + { + "epoch": 0.022933149868134387, + "grad_norm": 0.6327785858258099, + "learning_rate": 0.0007633587786259543, + "loss": 3.0311, + "step": 200 + }, + { + "epoch": 0.02304781561747506, + "grad_norm": 3.228731007266588, + "learning_rate": 0.0007671755725190839, + "loss": 3.329, + "step": 201 + }, + { + "epoch": 0.023162481366815733, + "grad_norm": 0.64153983302329, + "learning_rate": 0.0007709923664122137, + "loss": 2.9986, + "step": 202 + }, + { + "epoch": 0.023277147116156405, + "grad_norm": 0.5881412203184709, + "learning_rate": 0.0007748091603053435, + "loss": 3.0452, + "step": 203 + }, + { + "epoch": 0.023391812865497075, + "grad_norm": 0.5672212013258866, + "learning_rate": 0.0007786259541984733, + "loss": 2.9128, + "step": 204 + }, + { + "epoch": 0.023506478614837747, + "grad_norm": 0.5536613897718605, + "learning_rate": 0.000782442748091603, + "loss": 3.044, + "step": 205 + }, + { + "epoch": 0.02362114436417842, + "grad_norm": 0.5241079883708706, + "learning_rate": 0.0007862595419847328, + "loss": 3.0764, + "step": 206 + }, + { + "epoch": 0.023735810113519093, + "grad_norm": 0.5220428254717413, + "learning_rate": 0.0007900763358778626, + "loss": 3.0119, + "step": 207 + }, + { + "epoch": 0.023850475862859762, + "grad_norm": 0.5467201856986593, + "learning_rate": 0.0007938931297709924, + "loss": 2.9784, + "step": 208 + }, + { + "epoch": 0.023965141612200435, + "grad_norm": 0.5479192531206861, + "learning_rate": 0.0007977099236641223, + "loss": 2.9427, + "step": 209 + }, + { + "epoch": 0.024079807361541108, + "grad_norm": 0.5615227198006923, + "learning_rate": 0.0008015267175572519, + "loss": 3.1378, + "step": 210 + }, + { + "epoch": 0.02419447311088178, + "grad_norm": 0.5147014084174933, + "learning_rate": 0.0008053435114503816, + "loss": 3.0301, + "step": 211 + }, + { + "epoch": 0.024309138860222453, + "grad_norm": 0.5625278337037649, + "learning_rate": 0.0008091603053435115, + "loss": 2.8769, + "step": 212 + }, + { + "epoch": 0.024423804609563123, + "grad_norm": 0.7014380475755434, + "learning_rate": 0.0008129770992366412, + "loss": 2.9623, + "step": 213 + }, + { + "epoch": 0.024538470358903795, + "grad_norm": 0.6133818118706758, + "learning_rate": 0.000816793893129771, + "loss": 3.0224, + "step": 214 + }, + { + "epoch": 0.024653136108244468, + "grad_norm": 0.625292717113106, + "learning_rate": 0.0008206106870229007, + "loss": 3.0719, + "step": 215 + }, + { + "epoch": 0.02476780185758514, + "grad_norm": 0.6113203903609521, + "learning_rate": 0.0008244274809160306, + "loss": 3.1219, + "step": 216 + }, + { + "epoch": 0.02488246760692581, + "grad_norm": 0.5736723464363234, + "learning_rate": 0.0008282442748091604, + "loss": 2.9809, + "step": 217 + }, + { + "epoch": 0.024997133356266483, + "grad_norm": 0.6405130215173672, + "learning_rate": 0.0008320610687022901, + "loss": 2.9864, + "step": 218 + }, + { + "epoch": 0.025111799105607156, + "grad_norm": 0.6007317927514583, + "learning_rate": 0.0008358778625954198, + "loss": 3.0273, + "step": 219 + }, + { + "epoch": 0.02522646485494783, + "grad_norm": 0.6619853975857226, + "learning_rate": 0.0008396946564885496, + "loss": 3.0412, + "step": 220 + }, + { + "epoch": 0.025341130604288498, + "grad_norm": 0.5522567593234391, + "learning_rate": 0.0008435114503816795, + "loss": 2.8871, + "step": 221 + }, + { + "epoch": 0.02545579635362917, + "grad_norm": 0.5627933279668168, + "learning_rate": 0.0008473282442748091, + "loss": 3.1126, + "step": 222 + }, + { + "epoch": 0.025570462102969843, + "grad_norm": 0.6260317440159605, + "learning_rate": 0.000851145038167939, + "loss": 3.2618, + "step": 223 + }, + { + "epoch": 0.025685127852310516, + "grad_norm": 0.5511742256131716, + "learning_rate": 0.0008549618320610687, + "loss": 3.0041, + "step": 224 + }, + { + "epoch": 0.025799793601651185, + "grad_norm": 0.564344748628514, + "learning_rate": 0.0008587786259541986, + "loss": 2.8753, + "step": 225 + }, + { + "epoch": 0.025914459350991858, + "grad_norm": 0.5194384479234073, + "learning_rate": 0.0008625954198473283, + "loss": 2.9954, + "step": 226 + }, + { + "epoch": 0.02602912510033253, + "grad_norm": 0.5671036584361538, + "learning_rate": 0.0008664122137404581, + "loss": 3.0036, + "step": 227 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 0.6425359144321615, + "learning_rate": 0.0008702290076335878, + "loss": 2.9818, + "step": 228 + }, + { + "epoch": 0.026258456599013876, + "grad_norm": 0.6008191185003743, + "learning_rate": 0.0008740458015267176, + "loss": 2.9525, + "step": 229 + }, + { + "epoch": 0.026373122348354545, + "grad_norm": 0.5562413068913958, + "learning_rate": 0.0008778625954198474, + "loss": 2.9698, + "step": 230 + }, + { + "epoch": 0.026487788097695218, + "grad_norm": 0.5180517548960076, + "learning_rate": 0.000881679389312977, + "loss": 2.8583, + "step": 231 + }, + { + "epoch": 0.02660245384703589, + "grad_norm": 0.5299750923547059, + "learning_rate": 0.0008854961832061069, + "loss": 2.9318, + "step": 232 + }, + { + "epoch": 0.026717119596376564, + "grad_norm": 0.5847844322487309, + "learning_rate": 0.0008893129770992367, + "loss": 2.9227, + "step": 233 + }, + { + "epoch": 0.026831785345717233, + "grad_norm": 0.5623854753534139, + "learning_rate": 0.0008931297709923665, + "loss": 2.9027, + "step": 234 + }, + { + "epoch": 0.026946451095057906, + "grad_norm": 0.6055963553845306, + "learning_rate": 0.0008969465648854962, + "loss": 2.908, + "step": 235 + }, + { + "epoch": 0.02706111684439858, + "grad_norm": 0.5304107973619167, + "learning_rate": 0.0009007633587786259, + "loss": 2.9666, + "step": 236 + }, + { + "epoch": 0.02717578259373925, + "grad_norm": 0.47771171633395954, + "learning_rate": 0.0009045801526717558, + "loss": 3.0276, + "step": 237 + }, + { + "epoch": 0.02729044834307992, + "grad_norm": 0.5638093487086294, + "learning_rate": 0.0009083969465648855, + "loss": 2.9997, + "step": 238 + }, + { + "epoch": 0.027405114092420593, + "grad_norm": 0.5115169010872654, + "learning_rate": 0.0009122137404580153, + "loss": 3.0217, + "step": 239 + }, + { + "epoch": 0.027519779841761266, + "grad_norm": 0.5373470870425937, + "learning_rate": 0.000916030534351145, + "loss": 3.0147, + "step": 240 + }, + { + "epoch": 0.02763444559110194, + "grad_norm": 0.50389345283373, + "learning_rate": 0.0009198473282442749, + "loss": 3.0051, + "step": 241 + }, + { + "epoch": 0.02774911134044261, + "grad_norm": 0.5686045089313948, + "learning_rate": 0.0009236641221374046, + "loss": 2.9977, + "step": 242 + }, + { + "epoch": 0.02786377708978328, + "grad_norm": 0.5322094317111871, + "learning_rate": 0.0009274809160305345, + "loss": 2.9022, + "step": 243 + }, + { + "epoch": 0.027978442839123954, + "grad_norm": 0.580557681506128, + "learning_rate": 0.0009312977099236641, + "loss": 2.9606, + "step": 244 + }, + { + "epoch": 0.028093108588464626, + "grad_norm": 0.5819599288092461, + "learning_rate": 0.0009351145038167939, + "loss": 2.9676, + "step": 245 + }, + { + "epoch": 0.0282077743378053, + "grad_norm": 0.5426836978641648, + "learning_rate": 0.0009389312977099237, + "loss": 2.9792, + "step": 246 + }, + { + "epoch": 0.02832244008714597, + "grad_norm": 0.5434616035713983, + "learning_rate": 0.0009427480916030535, + "loss": 2.9292, + "step": 247 + }, + { + "epoch": 0.02843710583648664, + "grad_norm": 0.4931182373221491, + "learning_rate": 0.0009465648854961832, + "loss": 3.0644, + "step": 248 + }, + { + "epoch": 0.028551771585827314, + "grad_norm": 0.4828253688205587, + "learning_rate": 0.000950381679389313, + "loss": 2.9567, + "step": 249 + }, + { + "epoch": 0.028666437335167987, + "grad_norm": 0.6042815397586974, + "learning_rate": 0.0009541984732824428, + "loss": 3.1015, + "step": 250 + }, + { + "epoch": 0.028781103084508656, + "grad_norm": 0.5585071590393784, + "learning_rate": 0.0009580152671755726, + "loss": 2.9905, + "step": 251 + }, + { + "epoch": 0.02889576883384933, + "grad_norm": 0.4350057851189633, + "learning_rate": 0.0009618320610687023, + "loss": 2.9323, + "step": 252 + }, + { + "epoch": 0.02901043458319, + "grad_norm": 0.4673874881890266, + "learning_rate": 0.0009656488549618321, + "loss": 2.8471, + "step": 253 + }, + { + "epoch": 0.029125100332530674, + "grad_norm": 0.5052753207433914, + "learning_rate": 0.0009694656488549618, + "loss": 3.0418, + "step": 254 + }, + { + "epoch": 0.029239766081871343, + "grad_norm": 0.5336441114809883, + "learning_rate": 0.0009732824427480917, + "loss": 3.0095, + "step": 255 + }, + { + "epoch": 0.029354431831212016, + "grad_norm": 0.5279004804456354, + "learning_rate": 0.0009770992366412213, + "loss": 2.9115, + "step": 256 + }, + { + "epoch": 0.02946909758055269, + "grad_norm": 0.4661839015087628, + "learning_rate": 0.0009809160305343512, + "loss": 2.9732, + "step": 257 + }, + { + "epoch": 0.02958376332989336, + "grad_norm": 0.48913604863304283, + "learning_rate": 0.0009847328244274808, + "loss": 3.1976, + "step": 258 + }, + { + "epoch": 0.029698429079234034, + "grad_norm": 0.5222709393868871, + "learning_rate": 0.0009885496183206107, + "loss": 2.8806, + "step": 259 + }, + { + "epoch": 0.029813094828574704, + "grad_norm": 0.5177827866377747, + "learning_rate": 0.0009923664122137405, + "loss": 3.0158, + "step": 260 + }, + { + "epoch": 0.029927760577915376, + "grad_norm": 0.5031667027902577, + "learning_rate": 0.0009961832061068704, + "loss": 2.8104, + "step": 261 + }, + { + "epoch": 0.03004242632725605, + "grad_norm": 0.5330034729502853, + "learning_rate": 0.001, + "loss": 2.9629, + "step": 262 + }, + { + "epoch": 0.030157092076596722, + "grad_norm": 0.48611659249721517, + "learning_rate": 0.0009999999655172654, + "loss": 2.9476, + "step": 263 + }, + { + "epoch": 0.03027175782593739, + "grad_norm": 0.45520990511820436, + "learning_rate": 0.0009999998620690664, + "loss": 2.9605, + "step": 264 + }, + { + "epoch": 0.030386423575278064, + "grad_norm": 0.5216237841662955, + "learning_rate": 0.0009999996896554175, + "loss": 2.8796, + "step": 265 + }, + { + "epoch": 0.030501089324618737, + "grad_norm": 0.5290070708388479, + "learning_rate": 0.0009999994482763422, + "loss": 3.0675, + "step": 266 + }, + { + "epoch": 0.03061575507395941, + "grad_norm": 0.4912259087553911, + "learning_rate": 0.0009999991379318737, + "loss": 2.9481, + "step": 267 + }, + { + "epoch": 0.03073042082330008, + "grad_norm": 0.44837601964173174, + "learning_rate": 0.000999998758622055, + "loss": 2.9035, + "step": 268 + }, + { + "epoch": 0.03084508657264075, + "grad_norm": 0.4651387703391051, + "learning_rate": 0.0009999983103469385, + "loss": 2.9301, + "step": 269 + }, + { + "epoch": 0.030959752321981424, + "grad_norm": 0.5068936213869853, + "learning_rate": 0.0009999977931065857, + "loss": 2.9327, + "step": 270 + }, + { + "epoch": 0.031074418071322097, + "grad_norm": 0.45357650225532237, + "learning_rate": 0.0009999972069010686, + "loss": 2.9411, + "step": 271 + }, + { + "epoch": 0.031189083820662766, + "grad_norm": 0.4890846938377679, + "learning_rate": 0.0009999965517304673, + "loss": 2.9763, + "step": 272 + }, + { + "epoch": 0.03130374957000344, + "grad_norm": 0.4432967253380608, + "learning_rate": 0.0009999958275948725, + "loss": 2.8827, + "step": 273 + }, + { + "epoch": 0.03141841531934411, + "grad_norm": 0.447116056762687, + "learning_rate": 0.0009999950344943842, + "loss": 2.8219, + "step": 274 + }, + { + "epoch": 0.031533081068684785, + "grad_norm": 0.6067394722819113, + "learning_rate": 0.0009999941724291115, + "loss": 3.016, + "step": 275 + }, + { + "epoch": 0.03164774681802546, + "grad_norm": 0.4932292446077849, + "learning_rate": 0.0009999932413991737, + "loss": 2.8823, + "step": 276 + }, + { + "epoch": 0.03176241256736613, + "grad_norm": 0.48701283343057666, + "learning_rate": 0.0009999922414046986, + "loss": 3.0513, + "step": 277 + }, + { + "epoch": 0.0318770783167068, + "grad_norm": 0.508434848624952, + "learning_rate": 0.0009999911724458248, + "loss": 3.0388, + "step": 278 + }, + { + "epoch": 0.03199174406604747, + "grad_norm": 0.49753350320140627, + "learning_rate": 0.0009999900345226994, + "loss": 2.7854, + "step": 279 + }, + { + "epoch": 0.03210640981538814, + "grad_norm": 0.4815388988859689, + "learning_rate": 0.0009999888276354795, + "loss": 2.9384, + "step": 280 + }, + { + "epoch": 0.032221075564728814, + "grad_norm": 0.5176955770044522, + "learning_rate": 0.0009999875517843315, + "loss": 3.0144, + "step": 281 + }, + { + "epoch": 0.03233574131406949, + "grad_norm": 0.5086189881569775, + "learning_rate": 0.0009999862069694312, + "loss": 2.8854, + "step": 282 + }, + { + "epoch": 0.03245040706341016, + "grad_norm": 0.5146090525545141, + "learning_rate": 0.0009999847931909645, + "loss": 2.8299, + "step": 283 + }, + { + "epoch": 0.03256507281275083, + "grad_norm": 0.4748148321450234, + "learning_rate": 0.000999983310449126, + "loss": 2.8589, + "step": 284 + }, + { + "epoch": 0.032679738562091505, + "grad_norm": 0.5235880108167481, + "learning_rate": 0.0009999817587441203, + "loss": 2.8747, + "step": 285 + }, + { + "epoch": 0.03279440431143218, + "grad_norm": 0.5311297710383596, + "learning_rate": 0.0009999801380761615, + "loss": 2.9554, + "step": 286 + }, + { + "epoch": 0.032909070060772844, + "grad_norm": 0.4784552496511499, + "learning_rate": 0.0009999784484454734, + "loss": 2.9593, + "step": 287 + }, + { + "epoch": 0.033023735810113516, + "grad_norm": 0.4802526061817244, + "learning_rate": 0.0009999766898522884, + "loss": 2.9045, + "step": 288 + }, + { + "epoch": 0.03313840155945419, + "grad_norm": 0.5005836980588156, + "learning_rate": 0.0009999748622968496, + "loss": 2.8257, + "step": 289 + }, + { + "epoch": 0.03325306730879486, + "grad_norm": 0.4650637395613939, + "learning_rate": 0.000999972965779409, + "loss": 2.8127, + "step": 290 + }, + { + "epoch": 0.033367733058135535, + "grad_norm": 0.4668955532453089, + "learning_rate": 0.000999971000300228, + "loss": 2.8695, + "step": 291 + }, + { + "epoch": 0.03348239880747621, + "grad_norm": 0.4438363509090404, + "learning_rate": 0.000999968965859578, + "loss": 2.8448, + "step": 292 + }, + { + "epoch": 0.03359706455681688, + "grad_norm": 0.5172966659465619, + "learning_rate": 0.0009999668624577395, + "loss": 2.9425, + "step": 293 + }, + { + "epoch": 0.03371173030615755, + "grad_norm": 0.5904051865520534, + "learning_rate": 0.0009999646900950023, + "loss": 2.9115, + "step": 294 + }, + { + "epoch": 0.033826396055498226, + "grad_norm": 0.5539441014916824, + "learning_rate": 0.0009999624487716666, + "loss": 2.8551, + "step": 295 + }, + { + "epoch": 0.03394106180483889, + "grad_norm": 0.5640287178272303, + "learning_rate": 0.000999960138488041, + "loss": 3.0011, + "step": 296 + }, + { + "epoch": 0.034055727554179564, + "grad_norm": 0.5284553600918369, + "learning_rate": 0.0009999577592444443, + "loss": 2.8111, + "step": 297 + }, + { + "epoch": 0.03417039330352024, + "grad_norm": 0.4823528426024632, + "learning_rate": 0.000999955311041205, + "loss": 2.9556, + "step": 298 + }, + { + "epoch": 0.03428505905286091, + "grad_norm": 0.45317566462384484, + "learning_rate": 0.0009999527938786606, + "loss": 2.943, + "step": 299 + }, + { + "epoch": 0.03439972480220158, + "grad_norm": 0.46716802143957065, + "learning_rate": 0.0009999502077571581, + "loss": 2.8747, + "step": 300 + }, + { + "epoch": 0.034514390551542255, + "grad_norm": 0.4642349076752888, + "learning_rate": 0.0009999475526770545, + "loss": 2.9187, + "step": 301 + }, + { + "epoch": 0.03462905630088293, + "grad_norm": 0.47640791292712437, + "learning_rate": 0.0009999448286387158, + "loss": 2.9906, + "step": 302 + }, + { + "epoch": 0.0347437220502236, + "grad_norm": 0.4615406485094774, + "learning_rate": 0.0009999420356425178, + "loss": 2.9198, + "step": 303 + }, + { + "epoch": 0.034858387799564274, + "grad_norm": 0.48921009340035604, + "learning_rate": 0.0009999391736888457, + "loss": 2.9495, + "step": 304 + }, + { + "epoch": 0.03497305354890494, + "grad_norm": 0.4968585248746272, + "learning_rate": 0.0009999362427780942, + "loss": 2.9745, + "step": 305 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 0.45600293295713923, + "learning_rate": 0.0009999332429106679, + "loss": 3.0254, + "step": 306 + }, + { + "epoch": 0.035202385047586285, + "grad_norm": 0.4534698626554571, + "learning_rate": 0.00099993017408698, + "loss": 2.9239, + "step": 307 + }, + { + "epoch": 0.03531705079692696, + "grad_norm": 0.46740119016259796, + "learning_rate": 0.0009999270363074547, + "loss": 2.9629, + "step": 308 + }, + { + "epoch": 0.03543171654626763, + "grad_norm": 0.46923180876874787, + "learning_rate": 0.0009999238295725237, + "loss": 2.7397, + "step": 309 + }, + { + "epoch": 0.0355463822956083, + "grad_norm": 0.46030227609025304, + "learning_rate": 0.00099992055388263, + "loss": 2.7823, + "step": 310 + }, + { + "epoch": 0.035661048044948976, + "grad_norm": 0.43082212581012064, + "learning_rate": 0.0009999172092382252, + "loss": 2.791, + "step": 311 + }, + { + "epoch": 0.03577571379428965, + "grad_norm": 0.5083007377152199, + "learning_rate": 0.0009999137956397707, + "loss": 2.8933, + "step": 312 + }, + { + "epoch": 0.035890379543630314, + "grad_norm": 0.46552601907995467, + "learning_rate": 0.0009999103130877373, + "loss": 2.8624, + "step": 313 + }, + { + "epoch": 0.03600504529297099, + "grad_norm": 0.5276439825595753, + "learning_rate": 0.0009999067615826054, + "loss": 2.6825, + "step": 314 + }, + { + "epoch": 0.03611971104231166, + "grad_norm": 0.48976568062846787, + "learning_rate": 0.000999903141124865, + "loss": 2.8526, + "step": 315 + }, + { + "epoch": 0.03623437679165233, + "grad_norm": 0.42398549089291626, + "learning_rate": 0.000999899451715015, + "loss": 2.9776, + "step": 316 + }, + { + "epoch": 0.036349042540993005, + "grad_norm": 0.45757849328674405, + "learning_rate": 0.0009998956933535649, + "loss": 2.9982, + "step": 317 + }, + { + "epoch": 0.03646370829033368, + "grad_norm": 0.4513227501855517, + "learning_rate": 0.0009998918660410324, + "loss": 2.9134, + "step": 318 + }, + { + "epoch": 0.03657837403967435, + "grad_norm": 0.42758246075351874, + "learning_rate": 0.000999887969777946, + "loss": 2.8974, + "step": 319 + }, + { + "epoch": 0.036693039789015024, + "grad_norm": 0.40678570238073214, + "learning_rate": 0.000999884004564843, + "loss": 3.0485, + "step": 320 + }, + { + "epoch": 0.036807705538355696, + "grad_norm": 0.4639323755103496, + "learning_rate": 0.00099987997040227, + "loss": 2.9364, + "step": 321 + }, + { + "epoch": 0.03692237128769636, + "grad_norm": 0.4239782751237232, + "learning_rate": 0.0009998758672907838, + "loss": 3.0521, + "step": 322 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.4172037620755801, + "learning_rate": 0.0009998716952309501, + "loss": 2.9848, + "step": 323 + }, + { + "epoch": 0.03715170278637771, + "grad_norm": 0.4647679514305454, + "learning_rate": 0.0009998674542233445, + "loss": 2.9383, + "step": 324 + }, + { + "epoch": 0.03726636853571838, + "grad_norm": 0.420054295652772, + "learning_rate": 0.000999863144268552, + "loss": 3.002, + "step": 325 + }, + { + "epoch": 0.03738103428505905, + "grad_norm": 0.4393499511327307, + "learning_rate": 0.000999858765367167, + "loss": 3.0245, + "step": 326 + }, + { + "epoch": 0.037495700034399726, + "grad_norm": 0.49531529977310673, + "learning_rate": 0.0009998543175197936, + "loss": 3.022, + "step": 327 + }, + { + "epoch": 0.0376103657837404, + "grad_norm": 0.4795320146107337, + "learning_rate": 0.000999849800727045, + "loss": 2.7953, + "step": 328 + }, + { + "epoch": 0.03772503153308107, + "grad_norm": 0.4355446672099129, + "learning_rate": 0.0009998452149895445, + "loss": 2.8616, + "step": 329 + }, + { + "epoch": 0.03783969728242174, + "grad_norm": 0.49477302813660073, + "learning_rate": 0.0009998405603079243, + "loss": 2.9453, + "step": 330 + }, + { + "epoch": 0.03795436303176241, + "grad_norm": 0.5275794168993565, + "learning_rate": 0.0009998358366828269, + "loss": 2.7751, + "step": 331 + }, + { + "epoch": 0.03806902878110308, + "grad_norm": 0.43616854539262845, + "learning_rate": 0.0009998310441149034, + "loss": 2.9195, + "step": 332 + }, + { + "epoch": 0.038183694530443756, + "grad_norm": 0.44370510961710824, + "learning_rate": 0.000999826182604815, + "loss": 2.8261, + "step": 333 + }, + { + "epoch": 0.03829836027978443, + "grad_norm": 0.43761133706578115, + "learning_rate": 0.0009998212521532325, + "loss": 2.8425, + "step": 334 + }, + { + "epoch": 0.0384130260291251, + "grad_norm": 0.4345452231213624, + "learning_rate": 0.0009998162527608354, + "loss": 2.7781, + "step": 335 + }, + { + "epoch": 0.038527691778465774, + "grad_norm": 0.4611564265771901, + "learning_rate": 0.0009998111844283137, + "loss": 2.8982, + "step": 336 + }, + { + "epoch": 0.03864235752780645, + "grad_norm": 0.40749071096569695, + "learning_rate": 0.0009998060471563665, + "loss": 2.9036, + "step": 337 + }, + { + "epoch": 0.03875702327714712, + "grad_norm": 0.43280865612081587, + "learning_rate": 0.0009998008409457023, + "loss": 2.902, + "step": 338 + }, + { + "epoch": 0.038871689026487785, + "grad_norm": 0.46426390733591344, + "learning_rate": 0.000999795565797039, + "loss": 2.81, + "step": 339 + }, + { + "epoch": 0.03898635477582846, + "grad_norm": 0.4420370383407498, + "learning_rate": 0.0009997902217111045, + "loss": 3.0037, + "step": 340 + }, + { + "epoch": 0.03910102052516913, + "grad_norm": 0.46816638107870434, + "learning_rate": 0.0009997848086886357, + "loss": 2.8941, + "step": 341 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 0.4860481385647438, + "learning_rate": 0.0009997793267303792, + "loss": 2.8446, + "step": 342 + }, + { + "epoch": 0.039330352023850476, + "grad_norm": 0.445966586786985, + "learning_rate": 0.0009997737758370914, + "loss": 2.8013, + "step": 343 + }, + { + "epoch": 0.03944501777319115, + "grad_norm": 0.42505493187570276, + "learning_rate": 0.0009997681560095378, + "loss": 2.7778, + "step": 344 + }, + { + "epoch": 0.03955968352253182, + "grad_norm": 0.48039411990833053, + "learning_rate": 0.0009997624672484933, + "loss": 2.8519, + "step": 345 + }, + { + "epoch": 0.039674349271872494, + "grad_norm": 0.4978776779172268, + "learning_rate": 0.0009997567095547432, + "loss": 2.8413, + "step": 346 + }, + { + "epoch": 0.03978901502121316, + "grad_norm": 0.4672321347478045, + "learning_rate": 0.000999750882929081, + "loss": 3.0195, + "step": 347 + }, + { + "epoch": 0.03990368077055383, + "grad_norm": 0.44415121207344815, + "learning_rate": 0.0009997449873723105, + "loss": 2.869, + "step": 348 + }, + { + "epoch": 0.040018346519894506, + "grad_norm": 0.4692753216934293, + "learning_rate": 0.000999739022885245, + "loss": 2.8549, + "step": 349 + }, + { + "epoch": 0.04013301226923518, + "grad_norm": 0.45523093619831717, + "learning_rate": 0.0009997329894687072, + "loss": 2.9351, + "step": 350 + }, + { + "epoch": 0.04024767801857585, + "grad_norm": 0.3899975629615919, + "learning_rate": 0.0009997268871235296, + "loss": 2.8285, + "step": 351 + }, + { + "epoch": 0.040362343767916524, + "grad_norm": 0.44822064960942354, + "learning_rate": 0.0009997207158505533, + "loss": 2.8637, + "step": 352 + }, + { + "epoch": 0.0404770095172572, + "grad_norm": 0.46489344947093303, + "learning_rate": 0.0009997144756506298, + "loss": 2.8658, + "step": 353 + }, + { + "epoch": 0.04059167526659787, + "grad_norm": 0.41958800520627165, + "learning_rate": 0.00099970816652462, + "loss": 2.9923, + "step": 354 + }, + { + "epoch": 0.04070634101593854, + "grad_norm": 0.4452179584088451, + "learning_rate": 0.0009997017884733938, + "loss": 2.8469, + "step": 355 + }, + { + "epoch": 0.04082100676527921, + "grad_norm": 0.42988254427238115, + "learning_rate": 0.000999695341497831, + "loss": 2.9169, + "step": 356 + }, + { + "epoch": 0.04093567251461988, + "grad_norm": 0.3976714124486464, + "learning_rate": 0.0009996888255988207, + "loss": 2.9287, + "step": 357 + }, + { + "epoch": 0.041050338263960554, + "grad_norm": 0.3846609864616555, + "learning_rate": 0.0009996822407772623, + "loss": 2.9095, + "step": 358 + }, + { + "epoch": 0.041165004013301226, + "grad_norm": 0.41877958446431957, + "learning_rate": 0.0009996755870340633, + "loss": 2.901, + "step": 359 + }, + { + "epoch": 0.0412796697626419, + "grad_norm": 0.3791731881821083, + "learning_rate": 0.0009996688643701419, + "loss": 2.7776, + "step": 360 + }, + { + "epoch": 0.04139433551198257, + "grad_norm": 0.38121545020482545, + "learning_rate": 0.0009996620727864252, + "loss": 2.9549, + "step": 361 + }, + { + "epoch": 0.041509001261323245, + "grad_norm": 0.40155140891613317, + "learning_rate": 0.00099965521228385, + "loss": 2.8994, + "step": 362 + }, + { + "epoch": 0.04162366701066392, + "grad_norm": 0.43125441718046403, + "learning_rate": 0.0009996482828633624, + "loss": 2.7598, + "step": 363 + }, + { + "epoch": 0.04173833276000459, + "grad_norm": 0.4770770778782653, + "learning_rate": 0.0009996412845259183, + "loss": 2.9975, + "step": 364 + }, + { + "epoch": 0.041852998509345256, + "grad_norm": 0.49995516941142554, + "learning_rate": 0.0009996342172724833, + "loss": 2.8628, + "step": 365 + }, + { + "epoch": 0.04196766425868593, + "grad_norm": 0.41779998761126896, + "learning_rate": 0.0009996270811040318, + "loss": 2.724, + "step": 366 + }, + { + "epoch": 0.0420823300080266, + "grad_norm": 0.42749704001470756, + "learning_rate": 0.0009996198760215483, + "loss": 2.8689, + "step": 367 + }, + { + "epoch": 0.042196995757367274, + "grad_norm": 0.48617886389833176, + "learning_rate": 0.0009996126020260262, + "loss": 2.9577, + "step": 368 + }, + { + "epoch": 0.04231166150670795, + "grad_norm": 0.4163277868110168, + "learning_rate": 0.0009996052591184695, + "loss": 2.8136, + "step": 369 + }, + { + "epoch": 0.04242632725604862, + "grad_norm": 0.44050122649801227, + "learning_rate": 0.0009995978472998905, + "loss": 2.9061, + "step": 370 + }, + { + "epoch": 0.04254099300538929, + "grad_norm": 0.4096153889161094, + "learning_rate": 0.0009995903665713118, + "loss": 2.7944, + "step": 371 + }, + { + "epoch": 0.042655658754729965, + "grad_norm": 0.44912392018468306, + "learning_rate": 0.000999582816933765, + "loss": 2.8208, + "step": 372 + }, + { + "epoch": 0.04277032450407063, + "grad_norm": 0.4195373920266093, + "learning_rate": 0.0009995751983882914, + "loss": 2.8087, + "step": 373 + }, + { + "epoch": 0.042884990253411304, + "grad_norm": 0.4116148378613393, + "learning_rate": 0.000999567510935942, + "loss": 2.8351, + "step": 374 + }, + { + "epoch": 0.042999656002751976, + "grad_norm": 0.37984420580150025, + "learning_rate": 0.0009995597545777771, + "loss": 2.8685, + "step": 375 + }, + { + "epoch": 0.04311432175209265, + "grad_norm": 0.4365060876046911, + "learning_rate": 0.0009995519293148666, + "loss": 2.793, + "step": 376 + }, + { + "epoch": 0.04322898750143332, + "grad_norm": 0.5404209586211132, + "learning_rate": 0.0009995440351482897, + "loss": 2.946, + "step": 377 + }, + { + "epoch": 0.043343653250773995, + "grad_norm": 0.49822471681227515, + "learning_rate": 0.0009995360720791353, + "loss": 2.8649, + "step": 378 + }, + { + "epoch": 0.04345831900011467, + "grad_norm": 0.4123137207078488, + "learning_rate": 0.000999528040108502, + "loss": 2.8452, + "step": 379 + }, + { + "epoch": 0.04357298474945534, + "grad_norm": 0.4109075077833616, + "learning_rate": 0.0009995199392374972, + "loss": 2.7985, + "step": 380 + }, + { + "epoch": 0.04368765049879601, + "grad_norm": 0.47602620084294067, + "learning_rate": 0.0009995117694672386, + "loss": 2.826, + "step": 381 + }, + { + "epoch": 0.04380231624813668, + "grad_norm": 0.42159603089235476, + "learning_rate": 0.000999503530798853, + "loss": 2.8964, + "step": 382 + }, + { + "epoch": 0.04391698199747735, + "grad_norm": 0.4439458980035615, + "learning_rate": 0.0009994952232334766, + "loss": 2.8777, + "step": 383 + }, + { + "epoch": 0.044031647746818024, + "grad_norm": 0.44211484867618567, + "learning_rate": 0.0009994868467722556, + "loss": 2.9517, + "step": 384 + }, + { + "epoch": 0.0441463134961587, + "grad_norm": 0.4181866357033534, + "learning_rate": 0.0009994784014163449, + "loss": 2.9797, + "step": 385 + }, + { + "epoch": 0.04426097924549937, + "grad_norm": 0.446295654114178, + "learning_rate": 0.0009994698871669098, + "loss": 3.0006, + "step": 386 + }, + { + "epoch": 0.04437564499484004, + "grad_norm": 0.42215794213287927, + "learning_rate": 0.0009994613040251246, + "loss": 2.9112, + "step": 387 + }, + { + "epoch": 0.044490310744180715, + "grad_norm": 0.42651543776486645, + "learning_rate": 0.000999452651992173, + "loss": 2.8749, + "step": 388 + }, + { + "epoch": 0.04460497649352139, + "grad_norm": 0.42789857319007374, + "learning_rate": 0.0009994439310692486, + "loss": 2.7857, + "step": 389 + }, + { + "epoch": 0.044719642242862054, + "grad_norm": 0.42520017533306, + "learning_rate": 0.0009994351412575542, + "loss": 2.7778, + "step": 390 + }, + { + "epoch": 0.04483430799220273, + "grad_norm": 0.4875041590092833, + "learning_rate": 0.000999426282558302, + "loss": 2.8615, + "step": 391 + }, + { + "epoch": 0.0449489737415434, + "grad_norm": 0.4403588542883991, + "learning_rate": 0.000999417354972714, + "loss": 2.8771, + "step": 392 + }, + { + "epoch": 0.04506363949088407, + "grad_norm": 0.42578332900390736, + "learning_rate": 0.000999408358502022, + "loss": 2.8174, + "step": 393 + }, + { + "epoch": 0.045178305240224745, + "grad_norm": 0.4320011402827834, + "learning_rate": 0.0009993992931474661, + "loss": 3.0187, + "step": 394 + }, + { + "epoch": 0.04529297098956542, + "grad_norm": 0.44257499612757956, + "learning_rate": 0.0009993901589102974, + "loss": 3.005, + "step": 395 + }, + { + "epoch": 0.04540763673890609, + "grad_norm": 0.48557089809549814, + "learning_rate": 0.0009993809557917754, + "loss": 2.7653, + "step": 396 + }, + { + "epoch": 0.04552230248824676, + "grad_norm": 0.4214302359558213, + "learning_rate": 0.0009993716837931696, + "loss": 3.0089, + "step": 397 + }, + { + "epoch": 0.045636968237587436, + "grad_norm": 0.404193875906678, + "learning_rate": 0.000999362342915759, + "loss": 2.8322, + "step": 398 + }, + { + "epoch": 0.0457516339869281, + "grad_norm": 0.40348708265909317, + "learning_rate": 0.0009993529331608318, + "loss": 2.7964, + "step": 399 + }, + { + "epoch": 0.045866299736268774, + "grad_norm": 0.46358188465636196, + "learning_rate": 0.0009993434545296862, + "loss": 2.8539, + "step": 400 + }, + { + "epoch": 0.04598096548560945, + "grad_norm": 0.387197190688247, + "learning_rate": 0.0009993339070236292, + "loss": 2.8845, + "step": 401 + }, + { + "epoch": 0.04609563123495012, + "grad_norm": 0.44260755914743327, + "learning_rate": 0.000999324290643978, + "loss": 2.9898, + "step": 402 + }, + { + "epoch": 0.04621029698429079, + "grad_norm": 0.3849339441341739, + "learning_rate": 0.0009993146053920588, + "loss": 2.8997, + "step": 403 + }, + { + "epoch": 0.046324962733631465, + "grad_norm": 0.39529879885162666, + "learning_rate": 0.0009993048512692078, + "loss": 2.8602, + "step": 404 + }, + { + "epoch": 0.04643962848297214, + "grad_norm": 0.4196390379342417, + "learning_rate": 0.00099929502827677, + "loss": 2.9231, + "step": 405 + }, + { + "epoch": 0.04655429423231281, + "grad_norm": 0.4261818240672346, + "learning_rate": 0.0009992851364161006, + "loss": 2.7321, + "step": 406 + }, + { + "epoch": 0.04666895998165348, + "grad_norm": 0.4151510793545912, + "learning_rate": 0.0009992751756885637, + "loss": 2.9788, + "step": 407 + }, + { + "epoch": 0.04678362573099415, + "grad_norm": 0.3768094973952089, + "learning_rate": 0.0009992651460955335, + "loss": 2.8075, + "step": 408 + }, + { + "epoch": 0.04689829148033482, + "grad_norm": 0.4258341047280142, + "learning_rate": 0.0009992550476383931, + "loss": 2.8679, + "step": 409 + }, + { + "epoch": 0.047012957229675495, + "grad_norm": 0.4350418947825548, + "learning_rate": 0.0009992448803185356, + "loss": 2.8282, + "step": 410 + }, + { + "epoch": 0.04712762297901617, + "grad_norm": 0.4041485879286882, + "learning_rate": 0.0009992346441373633, + "loss": 2.9333, + "step": 411 + }, + { + "epoch": 0.04724228872835684, + "grad_norm": 0.4678578748104447, + "learning_rate": 0.0009992243390962883, + "loss": 2.8841, + "step": 412 + }, + { + "epoch": 0.04735695447769751, + "grad_norm": 0.41091702514489364, + "learning_rate": 0.0009992139651967319, + "loss": 2.7504, + "step": 413 + }, + { + "epoch": 0.047471620227038186, + "grad_norm": 0.49033793108209633, + "learning_rate": 0.0009992035224401245, + "loss": 2.8374, + "step": 414 + }, + { + "epoch": 0.04758628597637886, + "grad_norm": 0.42603739472570434, + "learning_rate": 0.0009991930108279074, + "loss": 2.7514, + "step": 415 + }, + { + "epoch": 0.047700951725719525, + "grad_norm": 0.38670909999135045, + "learning_rate": 0.0009991824303615293, + "loss": 2.7509, + "step": 416 + }, + { + "epoch": 0.0478156174750602, + "grad_norm": 0.35885819034707067, + "learning_rate": 0.0009991717810424506, + "loss": 2.8272, + "step": 417 + }, + { + "epoch": 0.04793028322440087, + "grad_norm": 0.39616114442120487, + "learning_rate": 0.0009991610628721397, + "loss": 2.8344, + "step": 418 + }, + { + "epoch": 0.04804494897374154, + "grad_norm": 0.44582603341367083, + "learning_rate": 0.000999150275852075, + "loss": 2.8899, + "step": 419 + }, + { + "epoch": 0.048159614723082216, + "grad_norm": 0.3846685537111089, + "learning_rate": 0.0009991394199837444, + "loss": 2.8781, + "step": 420 + }, + { + "epoch": 0.04827428047242289, + "grad_norm": 0.4568480990713995, + "learning_rate": 0.0009991284952686455, + "loss": 2.8454, + "step": 421 + }, + { + "epoch": 0.04838894622176356, + "grad_norm": 0.41497904792592516, + "learning_rate": 0.0009991175017082848, + "loss": 2.8815, + "step": 422 + }, + { + "epoch": 0.048503611971104234, + "grad_norm": 0.3988258676788166, + "learning_rate": 0.0009991064393041786, + "loss": 2.7265, + "step": 423 + }, + { + "epoch": 0.04861827772044491, + "grad_norm": 0.438444029795652, + "learning_rate": 0.0009990953080578533, + "loss": 2.9248, + "step": 424 + }, + { + "epoch": 0.04873294346978557, + "grad_norm": 0.45699647215205313, + "learning_rate": 0.0009990841079708435, + "loss": 2.7777, + "step": 425 + }, + { + "epoch": 0.048847609219126245, + "grad_norm": 0.4033532254636485, + "learning_rate": 0.0009990728390446946, + "loss": 2.7618, + "step": 426 + }, + { + "epoch": 0.04896227496846692, + "grad_norm": 0.463489201897885, + "learning_rate": 0.0009990615012809608, + "loss": 2.8633, + "step": 427 + }, + { + "epoch": 0.04907694071780759, + "grad_norm": 0.4387936549724385, + "learning_rate": 0.0009990500946812058, + "loss": 2.7559, + "step": 428 + }, + { + "epoch": 0.04919160646714826, + "grad_norm": 0.4006095893834648, + "learning_rate": 0.000999038619247003, + "loss": 2.8086, + "step": 429 + }, + { + "epoch": 0.049306272216488936, + "grad_norm": 0.38101367236228806, + "learning_rate": 0.0009990270749799352, + "loss": 2.7024, + "step": 430 + }, + { + "epoch": 0.04942093796582961, + "grad_norm": 0.409644108681533, + "learning_rate": 0.0009990154618815948, + "loss": 2.8839, + "step": 431 + }, + { + "epoch": 0.04953560371517028, + "grad_norm": 0.40140101284102714, + "learning_rate": 0.0009990037799535833, + "loss": 2.9054, + "step": 432 + }, + { + "epoch": 0.04965026946451095, + "grad_norm": 0.36254358675155723, + "learning_rate": 0.0009989920291975124, + "loss": 2.713, + "step": 433 + }, + { + "epoch": 0.04976493521385162, + "grad_norm": 0.3705019593703962, + "learning_rate": 0.0009989802096150029, + "loss": 2.8037, + "step": 434 + }, + { + "epoch": 0.04987960096319229, + "grad_norm": 0.39564835396997816, + "learning_rate": 0.0009989683212076848, + "loss": 2.8976, + "step": 435 + }, + { + "epoch": 0.049994266712532966, + "grad_norm": 0.3913977239078428, + "learning_rate": 0.0009989563639771978, + "loss": 2.8678, + "step": 436 + }, + { + "epoch": 0.05010893246187364, + "grad_norm": 0.4585408231388172, + "learning_rate": 0.0009989443379251916, + "loss": 2.8011, + "step": 437 + }, + { + "epoch": 0.05022359821121431, + "grad_norm": 0.42225842535929436, + "learning_rate": 0.0009989322430533245, + "loss": 2.8809, + "step": 438 + }, + { + "epoch": 0.050338263960554984, + "grad_norm": 0.41520748540270125, + "learning_rate": 0.0009989200793632652, + "loss": 2.731, + "step": 439 + }, + { + "epoch": 0.05045292970989566, + "grad_norm": 0.4220304138402917, + "learning_rate": 0.0009989078468566912, + "loss": 2.7616, + "step": 440 + }, + { + "epoch": 0.05056759545923633, + "grad_norm": 0.41827741604696245, + "learning_rate": 0.0009988955455352898, + "loss": 2.7708, + "step": 441 + }, + { + "epoch": 0.050682261208576995, + "grad_norm": 0.4129874963149311, + "learning_rate": 0.0009988831754007576, + "loss": 2.7688, + "step": 442 + }, + { + "epoch": 0.05079692695791767, + "grad_norm": 0.4389719568545934, + "learning_rate": 0.000998870736454801, + "loss": 2.7847, + "step": 443 + }, + { + "epoch": 0.05091159270725834, + "grad_norm": 0.4137136853137206, + "learning_rate": 0.0009988582286991356, + "loss": 2.8585, + "step": 444 + }, + { + "epoch": 0.051026258456599013, + "grad_norm": 0.3605568937049715, + "learning_rate": 0.0009988456521354868, + "loss": 2.9004, + "step": 445 + }, + { + "epoch": 0.051140924205939686, + "grad_norm": 0.3610587028048033, + "learning_rate": 0.000998833006765589, + "loss": 2.7843, + "step": 446 + }, + { + "epoch": 0.05125558995528036, + "grad_norm": 0.38311042087079367, + "learning_rate": 0.0009988202925911864, + "loss": 2.9845, + "step": 447 + }, + { + "epoch": 0.05137025570462103, + "grad_norm": 0.3751968234497866, + "learning_rate": 0.000998807509614033, + "loss": 2.9177, + "step": 448 + }, + { + "epoch": 0.051484921453961704, + "grad_norm": 0.382024099924426, + "learning_rate": 0.0009987946578358918, + "loss": 2.8338, + "step": 449 + }, + { + "epoch": 0.05159958720330237, + "grad_norm": 0.46865478196754445, + "learning_rate": 0.0009987817372585355, + "loss": 2.9415, + "step": 450 + }, + { + "epoch": 0.05171425295264304, + "grad_norm": 0.3941759802390462, + "learning_rate": 0.000998768747883746, + "loss": 2.6885, + "step": 451 + }, + { + "epoch": 0.051828918701983716, + "grad_norm": 0.3712492910534536, + "learning_rate": 0.0009987556897133151, + "loss": 2.8854, + "step": 452 + }, + { + "epoch": 0.05194358445132439, + "grad_norm": 0.409512039521161, + "learning_rate": 0.0009987425627490441, + "loss": 2.8651, + "step": 453 + }, + { + "epoch": 0.05205825020066506, + "grad_norm": 0.3636573505665796, + "learning_rate": 0.0009987293669927436, + "loss": 2.9018, + "step": 454 + }, + { + "epoch": 0.052172915950005734, + "grad_norm": 0.3887738106124062, + "learning_rate": 0.0009987161024462333, + "loss": 2.9214, + "step": 455 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 0.44779193511056586, + "learning_rate": 0.0009987027691113432, + "loss": 2.8703, + "step": 456 + }, + { + "epoch": 0.05240224744868708, + "grad_norm": 0.39891917558838513, + "learning_rate": 0.0009986893669899123, + "loss": 2.9884, + "step": 457 + }, + { + "epoch": 0.05251691319802775, + "grad_norm": 0.4068790315684028, + "learning_rate": 0.0009986758960837889, + "loss": 2.7961, + "step": 458 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.3717035465244831, + "learning_rate": 0.0009986623563948314, + "loss": 2.6558, + "step": 459 + }, + { + "epoch": 0.05274624469670909, + "grad_norm": 0.4175344791804306, + "learning_rate": 0.000998648747924907, + "loss": 2.8512, + "step": 460 + }, + { + "epoch": 0.052860910446049764, + "grad_norm": 0.33997947115432336, + "learning_rate": 0.0009986350706758934, + "loss": 2.7439, + "step": 461 + }, + { + "epoch": 0.052975576195390436, + "grad_norm": 0.43599493827369423, + "learning_rate": 0.0009986213246496762, + "loss": 2.7942, + "step": 462 + }, + { + "epoch": 0.05309024194473111, + "grad_norm": 0.3775569738548157, + "learning_rate": 0.000998607509848152, + "loss": 2.8576, + "step": 463 + }, + { + "epoch": 0.05320490769407178, + "grad_norm": 0.3886838855077135, + "learning_rate": 0.0009985936262732263, + "loss": 2.7963, + "step": 464 + }, + { + "epoch": 0.053319573443412455, + "grad_norm": 0.3608050283312012, + "learning_rate": 0.0009985796739268138, + "loss": 2.7426, + "step": 465 + }, + { + "epoch": 0.05343423919275313, + "grad_norm": 0.37412542057091924, + "learning_rate": 0.000998565652810839, + "loss": 2.7439, + "step": 466 + }, + { + "epoch": 0.05354890494209379, + "grad_norm": 0.4087290695052961, + "learning_rate": 0.000998551562927236, + "loss": 2.8025, + "step": 467 + }, + { + "epoch": 0.053663570691434466, + "grad_norm": 0.4484725980508869, + "learning_rate": 0.000998537404277948, + "loss": 2.9098, + "step": 468 + }, + { + "epoch": 0.05377823644077514, + "grad_norm": 0.452143256111978, + "learning_rate": 0.0009985231768649284, + "loss": 2.8642, + "step": 469 + }, + { + "epoch": 0.05389290219011581, + "grad_norm": 0.39270105614952217, + "learning_rate": 0.000998508880690139, + "loss": 2.8272, + "step": 470 + }, + { + "epoch": 0.054007567939456484, + "grad_norm": 0.40795197227106716, + "learning_rate": 0.000998494515755552, + "loss": 2.9005, + "step": 471 + }, + { + "epoch": 0.05412223368879716, + "grad_norm": 0.3821213386554317, + "learning_rate": 0.0009984800820631488, + "loss": 2.8737, + "step": 472 + }, + { + "epoch": 0.05423689943813783, + "grad_norm": 0.38450583254639414, + "learning_rate": 0.0009984655796149201, + "loss": 2.8181, + "step": 473 + }, + { + "epoch": 0.0543515651874785, + "grad_norm": 0.4122634233673069, + "learning_rate": 0.0009984510084128661, + "loss": 2.9515, + "step": 474 + }, + { + "epoch": 0.054466230936819175, + "grad_norm": 0.42214828000030447, + "learning_rate": 0.0009984363684589972, + "loss": 2.8553, + "step": 475 + }, + { + "epoch": 0.05458089668615984, + "grad_norm": 0.4115019028951167, + "learning_rate": 0.0009984216597553322, + "loss": 2.9232, + "step": 476 + }, + { + "epoch": 0.054695562435500514, + "grad_norm": 0.36757314119084644, + "learning_rate": 0.0009984068823039, + "loss": 2.8144, + "step": 477 + }, + { + "epoch": 0.054810228184841187, + "grad_norm": 0.3560515641845487, + "learning_rate": 0.0009983920361067388, + "loss": 2.7854, + "step": 478 + }, + { + "epoch": 0.05492489393418186, + "grad_norm": 0.3825011171831334, + "learning_rate": 0.0009983771211658965, + "loss": 2.8208, + "step": 479 + }, + { + "epoch": 0.05503955968352253, + "grad_norm": 0.4021136133809867, + "learning_rate": 0.0009983621374834303, + "loss": 2.9428, + "step": 480 + }, + { + "epoch": 0.055154225432863205, + "grad_norm": 0.3948708372571274, + "learning_rate": 0.0009983470850614068, + "loss": 2.824, + "step": 481 + }, + { + "epoch": 0.05526889118220388, + "grad_norm": 0.36096627443634055, + "learning_rate": 0.0009983319639019024, + "loss": 2.7043, + "step": 482 + }, + { + "epoch": 0.05538355693154455, + "grad_norm": 0.4279004261181492, + "learning_rate": 0.0009983167740070025, + "loss": 2.8255, + "step": 483 + }, + { + "epoch": 0.05549822268088522, + "grad_norm": 0.3964766765510947, + "learning_rate": 0.0009983015153788026, + "loss": 2.881, + "step": 484 + }, + { + "epoch": 0.05561288843022589, + "grad_norm": 0.3879824586196614, + "learning_rate": 0.000998286188019407, + "loss": 2.8855, + "step": 485 + }, + { + "epoch": 0.05572755417956656, + "grad_norm": 0.4107175714269911, + "learning_rate": 0.00099827079193093, + "loss": 2.8552, + "step": 486 + }, + { + "epoch": 0.055842219928907234, + "grad_norm": 0.37524531252432775, + "learning_rate": 0.0009982553271154953, + "loss": 2.926, + "step": 487 + }, + { + "epoch": 0.05595688567824791, + "grad_norm": 0.3487945682326408, + "learning_rate": 0.0009982397935752356, + "loss": 2.8472, + "step": 488 + }, + { + "epoch": 0.05607155142758858, + "grad_norm": 0.36200550892192385, + "learning_rate": 0.0009982241913122937, + "loss": 2.9161, + "step": 489 + }, + { + "epoch": 0.05618621717692925, + "grad_norm": 0.38264745729569327, + "learning_rate": 0.000998208520328822, + "loss": 2.9392, + "step": 490 + }, + { + "epoch": 0.056300882926269925, + "grad_norm": 0.3732956380777419, + "learning_rate": 0.0009981927806269812, + "loss": 2.8577, + "step": 491 + }, + { + "epoch": 0.0564155486756106, + "grad_norm": 0.3737453493778973, + "learning_rate": 0.0009981769722089428, + "loss": 2.8192, + "step": 492 + }, + { + "epoch": 0.056530214424951264, + "grad_norm": 0.3702471871343963, + "learning_rate": 0.0009981610950768873, + "loss": 2.8721, + "step": 493 + }, + { + "epoch": 0.05664488017429194, + "grad_norm": 0.39285155488821855, + "learning_rate": 0.0009981451492330046, + "loss": 2.9218, + "step": 494 + }, + { + "epoch": 0.05675954592363261, + "grad_norm": 0.4030256747836687, + "learning_rate": 0.000998129134679494, + "loss": 2.8364, + "step": 495 + }, + { + "epoch": 0.05687421167297328, + "grad_norm": 0.3694952677987905, + "learning_rate": 0.0009981130514185646, + "loss": 2.8109, + "step": 496 + }, + { + "epoch": 0.056988877422313955, + "grad_norm": 0.38313941332829515, + "learning_rate": 0.0009980968994524344, + "loss": 2.654, + "step": 497 + }, + { + "epoch": 0.05710354317165463, + "grad_norm": 0.38013556912390845, + "learning_rate": 0.0009980806787833316, + "loss": 2.7987, + "step": 498 + }, + { + "epoch": 0.0572182089209953, + "grad_norm": 0.33783275117854766, + "learning_rate": 0.0009980643894134935, + "loss": 2.886, + "step": 499 + }, + { + "epoch": 0.05733287467033597, + "grad_norm": 0.3509621952013636, + "learning_rate": 0.000998048031345167, + "loss": 2.8833, + "step": 500 + }, + { + "epoch": 0.057447540419676646, + "grad_norm": 0.3874581582412629, + "learning_rate": 0.0009980316045806082, + "loss": 2.9003, + "step": 501 + }, + { + "epoch": 0.05756220616901731, + "grad_norm": 0.35705766155032925, + "learning_rate": 0.0009980151091220826, + "loss": 2.6738, + "step": 502 + }, + { + "epoch": 0.057676871918357984, + "grad_norm": 0.4416859258856828, + "learning_rate": 0.000997998544971866, + "loss": 2.9362, + "step": 503 + }, + { + "epoch": 0.05779153766769866, + "grad_norm": 0.38238318547088923, + "learning_rate": 0.0009979819121322426, + "loss": 2.787, + "step": 504 + }, + { + "epoch": 0.05790620341703933, + "grad_norm": 0.3933629281612558, + "learning_rate": 0.000997965210605507, + "loss": 2.9104, + "step": 505 + }, + { + "epoch": 0.05802086916638, + "grad_norm": 0.4020282593835261, + "learning_rate": 0.0009979484403939626, + "loss": 2.9175, + "step": 506 + }, + { + "epoch": 0.058135534915720675, + "grad_norm": 0.38587767254592736, + "learning_rate": 0.0009979316014999226, + "loss": 2.8248, + "step": 507 + }, + { + "epoch": 0.05825020066506135, + "grad_norm": 0.45730926007253686, + "learning_rate": 0.0009979146939257098, + "loss": 2.7858, + "step": 508 + }, + { + "epoch": 0.05836486641440202, + "grad_norm": 0.3719279516401054, + "learning_rate": 0.000997897717673656, + "loss": 2.8946, + "step": 509 + }, + { + "epoch": 0.05847953216374269, + "grad_norm": 0.41184037116914446, + "learning_rate": 0.0009978806727461028, + "loss": 2.8045, + "step": 510 + }, + { + "epoch": 0.05859419791308336, + "grad_norm": 0.38402455958857123, + "learning_rate": 0.000997863559145401, + "loss": 2.8049, + "step": 511 + }, + { + "epoch": 0.05870886366242403, + "grad_norm": 0.3445610316443261, + "learning_rate": 0.0009978463768739118, + "loss": 2.9535, + "step": 512 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.3241117104974591, + "learning_rate": 0.0009978291259340045, + "loss": 2.7859, + "step": 513 + }, + { + "epoch": 0.05893819516110538, + "grad_norm": 0.3964773035946509, + "learning_rate": 0.0009978118063280587, + "loss": 2.8495, + "step": 514 + }, + { + "epoch": 0.05905286091044605, + "grad_norm": 0.3819741328822643, + "learning_rate": 0.0009977944180584637, + "loss": 2.8638, + "step": 515 + }, + { + "epoch": 0.05916752665978672, + "grad_norm": 0.38650938047527483, + "learning_rate": 0.0009977769611276173, + "loss": 2.9108, + "step": 516 + }, + { + "epoch": 0.059282192409127396, + "grad_norm": 0.3972840160880589, + "learning_rate": 0.0009977594355379275, + "loss": 2.721, + "step": 517 + }, + { + "epoch": 0.05939685815846807, + "grad_norm": 0.35831739139778357, + "learning_rate": 0.000997741841291812, + "loss": 2.7214, + "step": 518 + }, + { + "epoch": 0.059511523907808735, + "grad_norm": 0.36764254550332665, + "learning_rate": 0.000997724178391697, + "loss": 3.0099, + "step": 519 + }, + { + "epoch": 0.05962618965714941, + "grad_norm": 0.3445096992163432, + "learning_rate": 0.0009977064468400193, + "loss": 2.7609, + "step": 520 + }, + { + "epoch": 0.05974085540649008, + "grad_norm": 0.3556029284571166, + "learning_rate": 0.0009976886466392244, + "loss": 2.7693, + "step": 521 + }, + { + "epoch": 0.05985552115583075, + "grad_norm": 0.36099874973644624, + "learning_rate": 0.0009976707777917676, + "loss": 2.9042, + "step": 522 + }, + { + "epoch": 0.059970186905171426, + "grad_norm": 0.40264356947347824, + "learning_rate": 0.0009976528403001133, + "loss": 2.7669, + "step": 523 + }, + { + "epoch": 0.0600848526545121, + "grad_norm": 0.396778556485354, + "learning_rate": 0.0009976348341667358, + "loss": 2.8934, + "step": 524 + }, + { + "epoch": 0.06019951840385277, + "grad_norm": 0.39203694343139134, + "learning_rate": 0.0009976167593941188, + "loss": 2.8433, + "step": 525 + }, + { + "epoch": 0.060314184153193444, + "grad_norm": 0.38477339157637297, + "learning_rate": 0.000997598615984755, + "loss": 2.8111, + "step": 526 + }, + { + "epoch": 0.06042884990253411, + "grad_norm": 0.41357486699162727, + "learning_rate": 0.0009975804039411475, + "loss": 2.7638, + "step": 527 + }, + { + "epoch": 0.06054351565187478, + "grad_norm": 0.39748890533156267, + "learning_rate": 0.0009975621232658082, + "loss": 2.9401, + "step": 528 + }, + { + "epoch": 0.060658181401215455, + "grad_norm": 0.3527248762851314, + "learning_rate": 0.000997543773961258, + "loss": 2.8141, + "step": 529 + }, + { + "epoch": 0.06077284715055613, + "grad_norm": 0.39835240779303555, + "learning_rate": 0.0009975253560300283, + "loss": 2.7666, + "step": 530 + }, + { + "epoch": 0.0608875128998968, + "grad_norm": 0.38953294378848635, + "learning_rate": 0.0009975068694746596, + "loss": 2.869, + "step": 531 + }, + { + "epoch": 0.06100217864923747, + "grad_norm": 0.37673164372981777, + "learning_rate": 0.0009974883142977015, + "loss": 2.9162, + "step": 532 + }, + { + "epoch": 0.061116844398578146, + "grad_norm": 0.3909901055648809, + "learning_rate": 0.0009974696905017135, + "loss": 2.8557, + "step": 533 + }, + { + "epoch": 0.06123151014791882, + "grad_norm": 0.36111718041874913, + "learning_rate": 0.0009974509980892642, + "loss": 2.8222, + "step": 534 + }, + { + "epoch": 0.06134617589725949, + "grad_norm": 0.332632292785617, + "learning_rate": 0.0009974322370629321, + "loss": 2.8032, + "step": 535 + }, + { + "epoch": 0.06146084164660016, + "grad_norm": 0.4407896032526114, + "learning_rate": 0.000997413407425305, + "loss": 2.8223, + "step": 536 + }, + { + "epoch": 0.06157550739594083, + "grad_norm": 0.4057388442704425, + "learning_rate": 0.0009973945091789796, + "loss": 2.6936, + "step": 537 + }, + { + "epoch": 0.0616901731452815, + "grad_norm": 0.38759283542501144, + "learning_rate": 0.000997375542326563, + "loss": 2.9768, + "step": 538 + }, + { + "epoch": 0.061804838894622176, + "grad_norm": 0.40891702468704066, + "learning_rate": 0.0009973565068706711, + "loss": 2.9408, + "step": 539 + }, + { + "epoch": 0.06191950464396285, + "grad_norm": 0.3818812353296347, + "learning_rate": 0.0009973374028139296, + "loss": 2.7667, + "step": 540 + }, + { + "epoch": 0.06203417039330352, + "grad_norm": 0.3857912368951548, + "learning_rate": 0.0009973182301589736, + "loss": 2.8785, + "step": 541 + }, + { + "epoch": 0.062148836142644194, + "grad_norm": 0.3424298834515133, + "learning_rate": 0.0009972989889084473, + "loss": 2.803, + "step": 542 + }, + { + "epoch": 0.06226350189198487, + "grad_norm": 0.3703823241676832, + "learning_rate": 0.000997279679065005, + "loss": 2.8327, + "step": 543 + }, + { + "epoch": 0.06237816764132553, + "grad_norm": 0.42932212049176, + "learning_rate": 0.0009972603006313098, + "loss": 2.9458, + "step": 544 + }, + { + "epoch": 0.062492833390666205, + "grad_norm": 0.3499043126889513, + "learning_rate": 0.000997240853610035, + "loss": 2.9083, + "step": 545 + }, + { + "epoch": 0.06260749914000688, + "grad_norm": 0.39589964910489645, + "learning_rate": 0.0009972213380038627, + "loss": 2.7476, + "step": 546 + }, + { + "epoch": 0.06272216488934755, + "grad_norm": 0.3498146886720316, + "learning_rate": 0.0009972017538154845, + "loss": 2.9481, + "step": 547 + }, + { + "epoch": 0.06283683063868822, + "grad_norm": 0.37553717335623615, + "learning_rate": 0.000997182101047602, + "loss": 2.733, + "step": 548 + }, + { + "epoch": 0.0629514963880289, + "grad_norm": 0.3492990140797232, + "learning_rate": 0.0009971623797029258, + "loss": 2.7595, + "step": 549 + }, + { + "epoch": 0.06306616213736957, + "grad_norm": 0.37341979960887145, + "learning_rate": 0.0009971425897841765, + "loss": 2.7499, + "step": 550 + }, + { + "epoch": 0.06318082788671024, + "grad_norm": 0.3325077238621815, + "learning_rate": 0.0009971227312940826, + "loss": 2.6768, + "step": 551 + }, + { + "epoch": 0.06329549363605091, + "grad_norm": 0.3594070652745293, + "learning_rate": 0.0009971028042353844, + "loss": 2.8266, + "step": 552 + }, + { + "epoch": 0.06341015938539159, + "grad_norm": 0.3818454255905805, + "learning_rate": 0.00099708280861083, + "loss": 2.8824, + "step": 553 + }, + { + "epoch": 0.06352482513473226, + "grad_norm": 0.36249396519560034, + "learning_rate": 0.0009970627444231776, + "loss": 2.6487, + "step": 554 + }, + { + "epoch": 0.06363949088407293, + "grad_norm": 0.3485390751171873, + "learning_rate": 0.000997042611675194, + "loss": 2.8473, + "step": 555 + }, + { + "epoch": 0.0637541566334136, + "grad_norm": 0.36882355882253803, + "learning_rate": 0.0009970224103696568, + "loss": 2.8502, + "step": 556 + }, + { + "epoch": 0.06386882238275426, + "grad_norm": 0.3845975349869386, + "learning_rate": 0.0009970021405093523, + "loss": 2.8224, + "step": 557 + }, + { + "epoch": 0.06398348813209494, + "grad_norm": 0.38113924867701604, + "learning_rate": 0.0009969818020970761, + "loss": 2.7644, + "step": 558 + }, + { + "epoch": 0.06409815388143561, + "grad_norm": 0.34430525400984807, + "learning_rate": 0.0009969613951356338, + "loss": 2.8116, + "step": 559 + }, + { + "epoch": 0.06421281963077628, + "grad_norm": 0.4230378914190675, + "learning_rate": 0.0009969409196278398, + "loss": 2.769, + "step": 560 + }, + { + "epoch": 0.06432748538011696, + "grad_norm": 0.39375503945927054, + "learning_rate": 0.0009969203755765186, + "loss": 2.7863, + "step": 561 + }, + { + "epoch": 0.06444215112945763, + "grad_norm": 0.3756954173419415, + "learning_rate": 0.0009968997629845038, + "loss": 2.8838, + "step": 562 + }, + { + "epoch": 0.0645568168787983, + "grad_norm": 0.3622992443487895, + "learning_rate": 0.0009968790818546383, + "loss": 2.8616, + "step": 563 + }, + { + "epoch": 0.06467148262813897, + "grad_norm": 0.37140118296902913, + "learning_rate": 0.000996858332189775, + "loss": 2.8407, + "step": 564 + }, + { + "epoch": 0.06478614837747965, + "grad_norm": 0.37797397816412776, + "learning_rate": 0.0009968375139927756, + "loss": 2.9121, + "step": 565 + }, + { + "epoch": 0.06490081412682032, + "grad_norm": 0.38064691309203647, + "learning_rate": 0.000996816627266512, + "loss": 2.8938, + "step": 566 + }, + { + "epoch": 0.06501547987616099, + "grad_norm": 0.35375178350637, + "learning_rate": 0.0009967956720138647, + "loss": 2.7726, + "step": 567 + }, + { + "epoch": 0.06513014562550166, + "grad_norm": 0.346366813226489, + "learning_rate": 0.0009967746482377243, + "loss": 2.7801, + "step": 568 + }, + { + "epoch": 0.06524481137484234, + "grad_norm": 0.3537313644943524, + "learning_rate": 0.0009967535559409905, + "loss": 2.7426, + "step": 569 + }, + { + "epoch": 0.06535947712418301, + "grad_norm": 0.3020340017229151, + "learning_rate": 0.0009967323951265725, + "loss": 2.8297, + "step": 570 + }, + { + "epoch": 0.06547414287352368, + "grad_norm": 0.32555722690613514, + "learning_rate": 0.0009967111657973892, + "loss": 2.7867, + "step": 571 + }, + { + "epoch": 0.06558880862286436, + "grad_norm": 0.3265944969679014, + "learning_rate": 0.000996689867956369, + "loss": 2.8542, + "step": 572 + }, + { + "epoch": 0.06570347437220503, + "grad_norm": 0.36224473561518206, + "learning_rate": 0.0009966685016064491, + "loss": 2.8781, + "step": 573 + }, + { + "epoch": 0.06581814012154569, + "grad_norm": 0.3410136540998401, + "learning_rate": 0.0009966470667505767, + "loss": 2.8693, + "step": 574 + }, + { + "epoch": 0.06593280587088636, + "grad_norm": 0.3183704316576589, + "learning_rate": 0.0009966255633917086, + "loss": 2.8049, + "step": 575 + }, + { + "epoch": 0.06604747162022703, + "grad_norm": 0.3844789157888387, + "learning_rate": 0.0009966039915328105, + "loss": 2.8911, + "step": 576 + }, + { + "epoch": 0.0661621373695677, + "grad_norm": 0.3424279256999673, + "learning_rate": 0.0009965823511768578, + "loss": 2.9003, + "step": 577 + }, + { + "epoch": 0.06627680311890838, + "grad_norm": 0.36908185692227313, + "learning_rate": 0.0009965606423268355, + "loss": 2.8489, + "step": 578 + }, + { + "epoch": 0.06639146886824905, + "grad_norm": 0.36458311519461306, + "learning_rate": 0.000996538864985738, + "loss": 2.7201, + "step": 579 + }, + { + "epoch": 0.06650613461758972, + "grad_norm": 0.3615487372115599, + "learning_rate": 0.0009965170191565688, + "loss": 2.7444, + "step": 580 + }, + { + "epoch": 0.0666208003669304, + "grad_norm": 0.39619806912950345, + "learning_rate": 0.0009964951048423414, + "loss": 2.8388, + "step": 581 + }, + { + "epoch": 0.06673546611627107, + "grad_norm": 0.3890093341248368, + "learning_rate": 0.0009964731220460784, + "loss": 2.9052, + "step": 582 + }, + { + "epoch": 0.06685013186561174, + "grad_norm": 0.4225710627187908, + "learning_rate": 0.000996451070770812, + "loss": 2.7088, + "step": 583 + }, + { + "epoch": 0.06696479761495241, + "grad_norm": 0.3767880497760453, + "learning_rate": 0.0009964289510195831, + "loss": 2.7974, + "step": 584 + }, + { + "epoch": 0.06707946336429309, + "grad_norm": 0.3574157837491907, + "learning_rate": 0.0009964067627954436, + "loss": 2.8335, + "step": 585 + }, + { + "epoch": 0.06719412911363376, + "grad_norm": 0.34636327566531677, + "learning_rate": 0.0009963845061014534, + "loss": 2.6642, + "step": 586 + }, + { + "epoch": 0.06730879486297443, + "grad_norm": 0.3360337890102201, + "learning_rate": 0.0009963621809406826, + "loss": 2.6885, + "step": 587 + }, + { + "epoch": 0.0674234606123151, + "grad_norm": 0.3385662335426565, + "learning_rate": 0.0009963397873162107, + "loss": 2.8007, + "step": 588 + }, + { + "epoch": 0.06753812636165578, + "grad_norm": 0.3694483561969065, + "learning_rate": 0.0009963173252311257, + "loss": 2.9901, + "step": 589 + }, + { + "epoch": 0.06765279211099645, + "grad_norm": 0.3731327358327472, + "learning_rate": 0.0009962947946885268, + "loss": 2.8479, + "step": 590 + }, + { + "epoch": 0.06776745786033712, + "grad_norm": 0.402369210879225, + "learning_rate": 0.000996272195691521, + "loss": 2.8698, + "step": 591 + }, + { + "epoch": 0.06788212360967778, + "grad_norm": 0.3659673445159418, + "learning_rate": 0.0009962495282432255, + "loss": 2.8526, + "step": 592 + }, + { + "epoch": 0.06799678935901846, + "grad_norm": 0.3361275502562852, + "learning_rate": 0.0009962267923467672, + "loss": 2.8988, + "step": 593 + }, + { + "epoch": 0.06811145510835913, + "grad_norm": 0.39591127988693153, + "learning_rate": 0.0009962039880052817, + "loss": 2.6127, + "step": 594 + }, + { + "epoch": 0.0682261208576998, + "grad_norm": 0.35340756316430194, + "learning_rate": 0.0009961811152219148, + "loss": 2.8294, + "step": 595 + }, + { + "epoch": 0.06834078660704047, + "grad_norm": 0.3719616818913295, + "learning_rate": 0.0009961581739998209, + "loss": 2.8296, + "step": 596 + }, + { + "epoch": 0.06845545235638115, + "grad_norm": 0.3594376416776732, + "learning_rate": 0.0009961351643421646, + "loss": 2.7965, + "step": 597 + }, + { + "epoch": 0.06857011810572182, + "grad_norm": 0.3896906187725525, + "learning_rate": 0.0009961120862521195, + "loss": 2.9705, + "step": 598 + }, + { + "epoch": 0.06868478385506249, + "grad_norm": 0.3738404085989803, + "learning_rate": 0.000996088939732869, + "loss": 2.698, + "step": 599 + }, + { + "epoch": 0.06879944960440317, + "grad_norm": 0.4032353294844088, + "learning_rate": 0.0009960657247876056, + "loss": 2.8108, + "step": 600 + }, + { + "epoch": 0.06891411535374384, + "grad_norm": 0.3430422612414147, + "learning_rate": 0.000996042441419531, + "loss": 2.6703, + "step": 601 + }, + { + "epoch": 0.06902878110308451, + "grad_norm": 0.3770888954550399, + "learning_rate": 0.0009960190896318572, + "loss": 2.7161, + "step": 602 + }, + { + "epoch": 0.06914344685242518, + "grad_norm": 0.3822676031796531, + "learning_rate": 0.0009959956694278052, + "loss": 2.754, + "step": 603 + }, + { + "epoch": 0.06925811260176586, + "grad_norm": 0.36598496054960544, + "learning_rate": 0.000995972180810605, + "loss": 2.7121, + "step": 604 + }, + { + "epoch": 0.06937277835110653, + "grad_norm": 0.3757470517181319, + "learning_rate": 0.0009959486237834964, + "loss": 2.8235, + "step": 605 + }, + { + "epoch": 0.0694874441004472, + "grad_norm": 0.3349128446474399, + "learning_rate": 0.0009959249983497289, + "loss": 2.679, + "step": 606 + }, + { + "epoch": 0.06960210984978787, + "grad_norm": 0.3795356920227357, + "learning_rate": 0.0009959013045125612, + "loss": 2.7834, + "step": 607 + }, + { + "epoch": 0.06971677559912855, + "grad_norm": 0.34552900354485805, + "learning_rate": 0.000995877542275261, + "loss": 2.7724, + "step": 608 + }, + { + "epoch": 0.0698314413484692, + "grad_norm": 0.3369926696067527, + "learning_rate": 0.0009958537116411064, + "loss": 2.7884, + "step": 609 + }, + { + "epoch": 0.06994610709780988, + "grad_norm": 0.3416824558846171, + "learning_rate": 0.000995829812613384, + "loss": 2.9158, + "step": 610 + }, + { + "epoch": 0.07006077284715055, + "grad_norm": 0.36294983633352773, + "learning_rate": 0.0009958058451953902, + "loss": 2.8477, + "step": 611 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 0.32047073254492964, + "learning_rate": 0.0009957818093904313, + "loss": 2.8156, + "step": 612 + }, + { + "epoch": 0.0702901043458319, + "grad_norm": 0.3417012078702137, + "learning_rate": 0.000995757705201822, + "loss": 2.7574, + "step": 613 + }, + { + "epoch": 0.07040477009517257, + "grad_norm": 0.33546358511539814, + "learning_rate": 0.0009957335326328874, + "loss": 2.8695, + "step": 614 + }, + { + "epoch": 0.07051943584451324, + "grad_norm": 0.3762991305598653, + "learning_rate": 0.0009957092916869613, + "loss": 2.857, + "step": 615 + }, + { + "epoch": 0.07063410159385392, + "grad_norm": 0.3270171785606096, + "learning_rate": 0.0009956849823673877, + "loss": 2.7531, + "step": 616 + }, + { + "epoch": 0.07074876734319459, + "grad_norm": 0.36331965706596603, + "learning_rate": 0.0009956606046775192, + "loss": 2.8138, + "step": 617 + }, + { + "epoch": 0.07086343309253526, + "grad_norm": 0.3725256662048272, + "learning_rate": 0.0009956361586207186, + "loss": 2.8221, + "step": 618 + }, + { + "epoch": 0.07097809884187593, + "grad_norm": 0.36081542140400913, + "learning_rate": 0.0009956116442003575, + "loss": 2.8594, + "step": 619 + }, + { + "epoch": 0.0710927645912166, + "grad_norm": 0.3671749469206358, + "learning_rate": 0.0009955870614198174, + "loss": 2.7193, + "step": 620 + }, + { + "epoch": 0.07120743034055728, + "grad_norm": 0.3949819857563313, + "learning_rate": 0.000995562410282489, + "loss": 2.8042, + "step": 621 + }, + { + "epoch": 0.07132209608989795, + "grad_norm": 0.381801362492628, + "learning_rate": 0.0009955376907917722, + "loss": 2.7509, + "step": 622 + }, + { + "epoch": 0.07143676183923862, + "grad_norm": 0.31984895011934356, + "learning_rate": 0.0009955129029510768, + "loss": 2.9748, + "step": 623 + }, + { + "epoch": 0.0715514275885793, + "grad_norm": 0.350355652487004, + "learning_rate": 0.0009954880467638219, + "loss": 2.9142, + "step": 624 + }, + { + "epoch": 0.07166609333791997, + "grad_norm": 0.3508964715117756, + "learning_rate": 0.0009954631222334356, + "loss": 2.8187, + "step": 625 + }, + { + "epoch": 0.07178075908726063, + "grad_norm": 0.3431305741311322, + "learning_rate": 0.0009954381293633561, + "loss": 2.8682, + "step": 626 + }, + { + "epoch": 0.0718954248366013, + "grad_norm": 0.37750962987300485, + "learning_rate": 0.0009954130681570305, + "loss": 2.9051, + "step": 627 + }, + { + "epoch": 0.07201009058594197, + "grad_norm": 0.3743412098272145, + "learning_rate": 0.0009953879386179157, + "loss": 2.7099, + "step": 628 + }, + { + "epoch": 0.07212475633528265, + "grad_norm": 0.3468643282965864, + "learning_rate": 0.0009953627407494777, + "loss": 2.8391, + "step": 629 + }, + { + "epoch": 0.07223942208462332, + "grad_norm": 0.3716581639642345, + "learning_rate": 0.000995337474555192, + "loss": 2.7012, + "step": 630 + }, + { + "epoch": 0.07235408783396399, + "grad_norm": 0.36933068087161947, + "learning_rate": 0.0009953121400385438, + "loss": 2.7055, + "step": 631 + }, + { + "epoch": 0.07246875358330467, + "grad_norm": 0.3631770326076731, + "learning_rate": 0.0009952867372030273, + "loss": 2.7728, + "step": 632 + }, + { + "epoch": 0.07258341933264534, + "grad_norm": 0.36108585429463225, + "learning_rate": 0.0009952612660521466, + "loss": 2.8255, + "step": 633 + }, + { + "epoch": 0.07269808508198601, + "grad_norm": 0.36061693149097496, + "learning_rate": 0.0009952357265894146, + "loss": 2.786, + "step": 634 + }, + { + "epoch": 0.07281275083132668, + "grad_norm": 0.36033243593273134, + "learning_rate": 0.000995210118818354, + "loss": 2.8017, + "step": 635 + }, + { + "epoch": 0.07292741658066736, + "grad_norm": 0.32716082977794686, + "learning_rate": 0.0009951844427424973, + "loss": 2.6647, + "step": 636 + }, + { + "epoch": 0.07304208233000803, + "grad_norm": 0.3520457593453405, + "learning_rate": 0.0009951586983653858, + "loss": 2.7381, + "step": 637 + }, + { + "epoch": 0.0731567480793487, + "grad_norm": 0.3878747588227373, + "learning_rate": 0.0009951328856905703, + "loss": 2.8102, + "step": 638 + }, + { + "epoch": 0.07327141382868937, + "grad_norm": 0.330642368471026, + "learning_rate": 0.0009951070047216116, + "loss": 2.799, + "step": 639 + }, + { + "epoch": 0.07338607957803005, + "grad_norm": 0.33272242573820904, + "learning_rate": 0.000995081055462079, + "loss": 2.8038, + "step": 640 + }, + { + "epoch": 0.07350074532737072, + "grad_norm": 0.3545123390803429, + "learning_rate": 0.0009950550379155519, + "loss": 2.7676, + "step": 641 + }, + { + "epoch": 0.07361541107671139, + "grad_norm": 0.31441985755465995, + "learning_rate": 0.000995028952085619, + "loss": 2.7781, + "step": 642 + }, + { + "epoch": 0.07373007682605205, + "grad_norm": 0.35725732121595893, + "learning_rate": 0.0009950027979758781, + "loss": 2.7209, + "step": 643 + }, + { + "epoch": 0.07384474257539272, + "grad_norm": 0.3885393546845129, + "learning_rate": 0.0009949765755899369, + "loss": 2.6714, + "step": 644 + }, + { + "epoch": 0.0739594083247334, + "grad_norm": 0.3854769764670012, + "learning_rate": 0.0009949502849314123, + "loss": 2.7207, + "step": 645 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.35490346206148193, + "learning_rate": 0.0009949239260039304, + "loss": 2.8291, + "step": 646 + }, + { + "epoch": 0.07418873982341474, + "grad_norm": 0.3437993972006972, + "learning_rate": 0.0009948974988111272, + "loss": 2.7546, + "step": 647 + }, + { + "epoch": 0.07430340557275542, + "grad_norm": 0.34751332751574304, + "learning_rate": 0.0009948710033566475, + "loss": 2.7565, + "step": 648 + }, + { + "epoch": 0.07441807132209609, + "grad_norm": 0.3879093245857311, + "learning_rate": 0.000994844439644146, + "loss": 2.6464, + "step": 649 + }, + { + "epoch": 0.07453273707143676, + "grad_norm": 0.32740429137183163, + "learning_rate": 0.0009948178076772867, + "loss": 2.7501, + "step": 650 + }, + { + "epoch": 0.07464740282077743, + "grad_norm": 0.34349167746633186, + "learning_rate": 0.0009947911074597428, + "loss": 2.6821, + "step": 651 + }, + { + "epoch": 0.0747620685701181, + "grad_norm": 0.35287004485510176, + "learning_rate": 0.0009947643389951973, + "loss": 2.6545, + "step": 652 + }, + { + "epoch": 0.07487673431945878, + "grad_norm": 0.32959540606193877, + "learning_rate": 0.0009947375022873422, + "loss": 2.6083, + "step": 653 + }, + { + "epoch": 0.07499140006879945, + "grad_norm": 0.3469641750994297, + "learning_rate": 0.0009947105973398794, + "loss": 2.7522, + "step": 654 + }, + { + "epoch": 0.07510606581814012, + "grad_norm": 0.386278937300029, + "learning_rate": 0.0009946836241565195, + "loss": 2.9341, + "step": 655 + }, + { + "epoch": 0.0752207315674808, + "grad_norm": 0.3601259442845363, + "learning_rate": 0.0009946565827409833, + "loss": 2.759, + "step": 656 + }, + { + "epoch": 0.07533539731682147, + "grad_norm": 0.33892103184916766, + "learning_rate": 0.0009946294730970005, + "loss": 2.7683, + "step": 657 + }, + { + "epoch": 0.07545006306616214, + "grad_norm": 0.3372733296202974, + "learning_rate": 0.0009946022952283106, + "loss": 2.7155, + "step": 658 + }, + { + "epoch": 0.07556472881550282, + "grad_norm": 0.33473565447204673, + "learning_rate": 0.0009945750491386616, + "loss": 2.7814, + "step": 659 + }, + { + "epoch": 0.07567939456484347, + "grad_norm": 0.342343288508645, + "learning_rate": 0.0009945477348318123, + "loss": 2.6832, + "step": 660 + }, + { + "epoch": 0.07579406031418415, + "grad_norm": 0.3328424855831501, + "learning_rate": 0.00099452035231153, + "loss": 2.6866, + "step": 661 + }, + { + "epoch": 0.07590872606352482, + "grad_norm": 0.38822682124517843, + "learning_rate": 0.0009944929015815913, + "loss": 2.6052, + "step": 662 + }, + { + "epoch": 0.07602339181286549, + "grad_norm": 0.37760905814629636, + "learning_rate": 0.0009944653826457828, + "loss": 2.7967, + "step": 663 + }, + { + "epoch": 0.07613805756220617, + "grad_norm": 0.3918384530780941, + "learning_rate": 0.0009944377955079004, + "loss": 2.9643, + "step": 664 + }, + { + "epoch": 0.07625272331154684, + "grad_norm": 0.37170981837961287, + "learning_rate": 0.0009944101401717486, + "loss": 2.7883, + "step": 665 + }, + { + "epoch": 0.07636738906088751, + "grad_norm": 0.3683137221815697, + "learning_rate": 0.0009943824166411424, + "loss": 2.8607, + "step": 666 + }, + { + "epoch": 0.07648205481022818, + "grad_norm": 0.32363810974230633, + "learning_rate": 0.0009943546249199056, + "loss": 2.8634, + "step": 667 + }, + { + "epoch": 0.07659672055956886, + "grad_norm": 0.3218890905066503, + "learning_rate": 0.0009943267650118716, + "loss": 2.7987, + "step": 668 + }, + { + "epoch": 0.07671138630890953, + "grad_norm": 0.36291221463340795, + "learning_rate": 0.0009942988369208829, + "loss": 2.7485, + "step": 669 + }, + { + "epoch": 0.0768260520582502, + "grad_norm": 0.35554600874603187, + "learning_rate": 0.000994270840650792, + "loss": 2.7304, + "step": 670 + }, + { + "epoch": 0.07694071780759087, + "grad_norm": 0.3235493287698985, + "learning_rate": 0.0009942427762054604, + "loss": 2.8015, + "step": 671 + }, + { + "epoch": 0.07705538355693155, + "grad_norm": 0.3300005966936813, + "learning_rate": 0.0009942146435887589, + "loss": 2.7567, + "step": 672 + }, + { + "epoch": 0.07717004930627222, + "grad_norm": 0.370024131947592, + "learning_rate": 0.0009941864428045677, + "loss": 2.9205, + "step": 673 + }, + { + "epoch": 0.0772847150556129, + "grad_norm": 0.3583115824591791, + "learning_rate": 0.0009941581738567768, + "loss": 2.7386, + "step": 674 + }, + { + "epoch": 0.07739938080495357, + "grad_norm": 0.3738066775311378, + "learning_rate": 0.0009941298367492854, + "loss": 2.9744, + "step": 675 + }, + { + "epoch": 0.07751404655429424, + "grad_norm": 0.31146002194075517, + "learning_rate": 0.0009941014314860021, + "loss": 2.6852, + "step": 676 + }, + { + "epoch": 0.0776287123036349, + "grad_norm": 0.37156145910077804, + "learning_rate": 0.0009940729580708448, + "loss": 2.8119, + "step": 677 + }, + { + "epoch": 0.07774337805297557, + "grad_norm": 0.323385660484067, + "learning_rate": 0.0009940444165077408, + "loss": 2.689, + "step": 678 + }, + { + "epoch": 0.07785804380231624, + "grad_norm": 0.3257550605322397, + "learning_rate": 0.0009940158068006267, + "loss": 2.8024, + "step": 679 + }, + { + "epoch": 0.07797270955165692, + "grad_norm": 0.33279684678649046, + "learning_rate": 0.0009939871289534488, + "loss": 2.6566, + "step": 680 + }, + { + "epoch": 0.07808737530099759, + "grad_norm": 0.3480450120812373, + "learning_rate": 0.0009939583829701628, + "loss": 2.7286, + "step": 681 + }, + { + "epoch": 0.07820204105033826, + "grad_norm": 0.32998462718005717, + "learning_rate": 0.0009939295688547337, + "loss": 2.7529, + "step": 682 + }, + { + "epoch": 0.07831670679967893, + "grad_norm": 0.34063967287480673, + "learning_rate": 0.0009939006866111356, + "loss": 2.7465, + "step": 683 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.37495047996329034, + "learning_rate": 0.0009938717362433524, + "loss": 2.6946, + "step": 684 + }, + { + "epoch": 0.07854603829836028, + "grad_norm": 0.3707009263477392, + "learning_rate": 0.0009938427177553773, + "loss": 2.6736, + "step": 685 + }, + { + "epoch": 0.07866070404770095, + "grad_norm": 0.4067848691390199, + "learning_rate": 0.0009938136311512127, + "loss": 2.918, + "step": 686 + }, + { + "epoch": 0.07877536979704163, + "grad_norm": 0.3741709553287032, + "learning_rate": 0.0009937844764348707, + "loss": 2.8201, + "step": 687 + }, + { + "epoch": 0.0788900355463823, + "grad_norm": 0.3502747640284244, + "learning_rate": 0.0009937552536103727, + "loss": 2.7879, + "step": 688 + }, + { + "epoch": 0.07900470129572297, + "grad_norm": 0.31784251294093274, + "learning_rate": 0.000993725962681749, + "loss": 2.7747, + "step": 689 + }, + { + "epoch": 0.07911936704506364, + "grad_norm": 0.34653285764421765, + "learning_rate": 0.0009936966036530402, + "loss": 2.8962, + "step": 690 + }, + { + "epoch": 0.07923403279440432, + "grad_norm": 0.32380406666958045, + "learning_rate": 0.0009936671765282956, + "loss": 2.7673, + "step": 691 + }, + { + "epoch": 0.07934869854374499, + "grad_norm": 0.3171829588982205, + "learning_rate": 0.0009936376813115741, + "loss": 2.6686, + "step": 692 + }, + { + "epoch": 0.07946336429308566, + "grad_norm": 0.35318233477147976, + "learning_rate": 0.000993608118006944, + "loss": 2.7779, + "step": 693 + }, + { + "epoch": 0.07957803004242632, + "grad_norm": 0.37127460357755665, + "learning_rate": 0.0009935784866184833, + "loss": 2.7071, + "step": 694 + }, + { + "epoch": 0.079692695791767, + "grad_norm": 0.3232693416643724, + "learning_rate": 0.0009935487871502787, + "loss": 2.6805, + "step": 695 + }, + { + "epoch": 0.07980736154110767, + "grad_norm": 0.39550106757047115, + "learning_rate": 0.0009935190196064267, + "loss": 2.784, + "step": 696 + }, + { + "epoch": 0.07992202729044834, + "grad_norm": 0.32802531513593103, + "learning_rate": 0.0009934891839910333, + "loss": 2.809, + "step": 697 + }, + { + "epoch": 0.08003669303978901, + "grad_norm": 0.33505304970547084, + "learning_rate": 0.0009934592803082138, + "loss": 2.7369, + "step": 698 + }, + { + "epoch": 0.08015135878912968, + "grad_norm": 0.3546371232382824, + "learning_rate": 0.0009934293085620929, + "loss": 2.7714, + "step": 699 + }, + { + "epoch": 0.08026602453847036, + "grad_norm": 0.32653696811924965, + "learning_rate": 0.0009933992687568044, + "loss": 2.7682, + "step": 700 + }, + { + "epoch": 0.08038069028781103, + "grad_norm": 0.3361782255198608, + "learning_rate": 0.0009933691608964917, + "loss": 2.6437, + "step": 701 + }, + { + "epoch": 0.0804953560371517, + "grad_norm": 0.3374368050741158, + "learning_rate": 0.0009933389849853078, + "loss": 2.8596, + "step": 702 + }, + { + "epoch": 0.08061002178649238, + "grad_norm": 0.3459579963693237, + "learning_rate": 0.0009933087410274148, + "loss": 2.9184, + "step": 703 + }, + { + "epoch": 0.08072468753583305, + "grad_norm": 0.34996499835452244, + "learning_rate": 0.0009932784290269843, + "loss": 2.8407, + "step": 704 + }, + { + "epoch": 0.08083935328517372, + "grad_norm": 0.38281053692307715, + "learning_rate": 0.0009932480489881974, + "loss": 2.863, + "step": 705 + }, + { + "epoch": 0.0809540190345144, + "grad_norm": 0.3112690038667326, + "learning_rate": 0.0009932176009152442, + "loss": 2.6913, + "step": 706 + }, + { + "epoch": 0.08106868478385507, + "grad_norm": 0.33220821626530317, + "learning_rate": 0.0009931870848123245, + "loss": 2.6733, + "step": 707 + }, + { + "epoch": 0.08118335053319574, + "grad_norm": 0.33683496984082895, + "learning_rate": 0.0009931565006836476, + "loss": 2.8295, + "step": 708 + }, + { + "epoch": 0.08129801628253641, + "grad_norm": 0.37466401491119167, + "learning_rate": 0.0009931258485334315, + "loss": 2.773, + "step": 709 + }, + { + "epoch": 0.08141268203187708, + "grad_norm": 0.35152365970651545, + "learning_rate": 0.0009930951283659048, + "loss": 2.8038, + "step": 710 + }, + { + "epoch": 0.08152734778121776, + "grad_norm": 0.36865897054377655, + "learning_rate": 0.0009930643401853043, + "loss": 2.755, + "step": 711 + }, + { + "epoch": 0.08164201353055842, + "grad_norm": 0.3337295547740383, + "learning_rate": 0.0009930334839958765, + "loss": 2.7796, + "step": 712 + }, + { + "epoch": 0.08175667927989909, + "grad_norm": 0.32593844222607615, + "learning_rate": 0.000993002559801878, + "loss": 2.6649, + "step": 713 + }, + { + "epoch": 0.08187134502923976, + "grad_norm": 0.4235123937261032, + "learning_rate": 0.0009929715676075736, + "loss": 2.926, + "step": 714 + }, + { + "epoch": 0.08198601077858043, + "grad_norm": 0.3498516325730499, + "learning_rate": 0.0009929405074172383, + "loss": 2.722, + "step": 715 + }, + { + "epoch": 0.08210067652792111, + "grad_norm": 0.38653762822311305, + "learning_rate": 0.0009929093792351567, + "loss": 2.7893, + "step": 716 + }, + { + "epoch": 0.08221534227726178, + "grad_norm": 0.37093427758230435, + "learning_rate": 0.0009928781830656215, + "loss": 2.8822, + "step": 717 + }, + { + "epoch": 0.08233000802660245, + "grad_norm": 0.32528309251182336, + "learning_rate": 0.0009928469189129363, + "loss": 2.7183, + "step": 718 + }, + { + "epoch": 0.08244467377594313, + "grad_norm": 0.344041478998291, + "learning_rate": 0.0009928155867814131, + "loss": 2.753, + "step": 719 + }, + { + "epoch": 0.0825593395252838, + "grad_norm": 0.33755859849780784, + "learning_rate": 0.0009927841866753735, + "loss": 2.729, + "step": 720 + }, + { + "epoch": 0.08267400527462447, + "grad_norm": 0.4004082459021678, + "learning_rate": 0.000992752718599149, + "loss": 2.8534, + "step": 721 + }, + { + "epoch": 0.08278867102396514, + "grad_norm": 0.3318401978047511, + "learning_rate": 0.0009927211825570793, + "loss": 2.6772, + "step": 722 + }, + { + "epoch": 0.08290333677330582, + "grad_norm": 0.35634425350570964, + "learning_rate": 0.000992689578553515, + "loss": 2.7819, + "step": 723 + }, + { + "epoch": 0.08301800252264649, + "grad_norm": 0.3497219381296959, + "learning_rate": 0.0009926579065928144, + "loss": 2.6364, + "step": 724 + }, + { + "epoch": 0.08313266827198716, + "grad_norm": 0.34057995484950926, + "learning_rate": 0.000992626166679347, + "loss": 2.7745, + "step": 725 + }, + { + "epoch": 0.08324733402132783, + "grad_norm": 0.3095893172406948, + "learning_rate": 0.0009925943588174897, + "loss": 2.83, + "step": 726 + }, + { + "epoch": 0.08336199977066851, + "grad_norm": 0.3837932914007685, + "learning_rate": 0.0009925624830116305, + "loss": 2.9259, + "step": 727 + }, + { + "epoch": 0.08347666552000918, + "grad_norm": 0.3594329020185206, + "learning_rate": 0.000992530539266166, + "loss": 2.71, + "step": 728 + }, + { + "epoch": 0.08359133126934984, + "grad_norm": 0.3195118493758497, + "learning_rate": 0.0009924985275855018, + "loss": 2.7717, + "step": 729 + }, + { + "epoch": 0.08370599701869051, + "grad_norm": 0.3253821168076408, + "learning_rate": 0.000992466447974054, + "loss": 2.8371, + "step": 730 + }, + { + "epoch": 0.08382066276803118, + "grad_norm": 0.3466344885513138, + "learning_rate": 0.0009924343004362466, + "loss": 2.7458, + "step": 731 + }, + { + "epoch": 0.08393532851737186, + "grad_norm": 0.3909718641136255, + "learning_rate": 0.0009924020849765142, + "loss": 2.8142, + "step": 732 + }, + { + "epoch": 0.08404999426671253, + "grad_norm": 0.3386934326844014, + "learning_rate": 0.0009923698015993003, + "loss": 2.6824, + "step": 733 + }, + { + "epoch": 0.0841646600160532, + "grad_norm": 0.3387661411014136, + "learning_rate": 0.0009923374503090577, + "loss": 2.8276, + "step": 734 + }, + { + "epoch": 0.08427932576539388, + "grad_norm": 0.3950938166321506, + "learning_rate": 0.0009923050311102487, + "loss": 2.8077, + "step": 735 + }, + { + "epoch": 0.08439399151473455, + "grad_norm": 0.3391695823445589, + "learning_rate": 0.0009922725440073446, + "loss": 2.7546, + "step": 736 + }, + { + "epoch": 0.08450865726407522, + "grad_norm": 0.3010384997072432, + "learning_rate": 0.0009922399890048268, + "loss": 2.7022, + "step": 737 + }, + { + "epoch": 0.0846233230134159, + "grad_norm": 0.33198956795048035, + "learning_rate": 0.0009922073661071855, + "loss": 2.7311, + "step": 738 + }, + { + "epoch": 0.08473798876275657, + "grad_norm": 0.3644362244355827, + "learning_rate": 0.0009921746753189203, + "loss": 2.9749, + "step": 739 + }, + { + "epoch": 0.08485265451209724, + "grad_norm": 0.35304746718398816, + "learning_rate": 0.0009921419166445404, + "loss": 2.5974, + "step": 740 + }, + { + "epoch": 0.08496732026143791, + "grad_norm": 0.321195109224156, + "learning_rate": 0.0009921090900885641, + "loss": 2.6718, + "step": 741 + }, + { + "epoch": 0.08508198601077858, + "grad_norm": 0.3191116220424898, + "learning_rate": 0.0009920761956555193, + "loss": 2.7905, + "step": 742 + }, + { + "epoch": 0.08519665176011926, + "grad_norm": 0.4261166185241973, + "learning_rate": 0.0009920432333499433, + "loss": 2.7329, + "step": 743 + }, + { + "epoch": 0.08531131750945993, + "grad_norm": 0.38392618279517443, + "learning_rate": 0.0009920102031763822, + "loss": 2.7382, + "step": 744 + }, + { + "epoch": 0.0854259832588006, + "grad_norm": 0.39672373374294195, + "learning_rate": 0.0009919771051393922, + "loss": 2.7643, + "step": 745 + }, + { + "epoch": 0.08554064900814126, + "grad_norm": 0.3653929855332756, + "learning_rate": 0.0009919439392435385, + "loss": 2.885, + "step": 746 + }, + { + "epoch": 0.08565531475748193, + "grad_norm": 0.34027173273402195, + "learning_rate": 0.0009919107054933956, + "loss": 2.7345, + "step": 747 + }, + { + "epoch": 0.08576998050682261, + "grad_norm": 0.38350348090530884, + "learning_rate": 0.0009918774038935477, + "loss": 2.7178, + "step": 748 + }, + { + "epoch": 0.08588464625616328, + "grad_norm": 0.3253844923475316, + "learning_rate": 0.000991844034448588, + "loss": 2.7696, + "step": 749 + }, + { + "epoch": 0.08599931200550395, + "grad_norm": 0.3156746799195098, + "learning_rate": 0.000991810597163119, + "loss": 2.6778, + "step": 750 + }, + { + "epoch": 0.08611397775484463, + "grad_norm": 0.3458491421738056, + "learning_rate": 0.000991777092041753, + "loss": 2.6871, + "step": 751 + }, + { + "epoch": 0.0862286435041853, + "grad_norm": 0.3347314285110529, + "learning_rate": 0.0009917435190891111, + "loss": 2.6569, + "step": 752 + }, + { + "epoch": 0.08634330925352597, + "grad_norm": 0.3631361433305357, + "learning_rate": 0.0009917098783098243, + "loss": 2.8881, + "step": 753 + }, + { + "epoch": 0.08645797500286664, + "grad_norm": 0.38313465064211627, + "learning_rate": 0.0009916761697085327, + "loss": 2.77, + "step": 754 + }, + { + "epoch": 0.08657264075220732, + "grad_norm": 0.3714088361988252, + "learning_rate": 0.0009916423932898857, + "loss": 2.7506, + "step": 755 + }, + { + "epoch": 0.08668730650154799, + "grad_norm": 0.31867951936812083, + "learning_rate": 0.0009916085490585423, + "loss": 2.9981, + "step": 756 + }, + { + "epoch": 0.08680197225088866, + "grad_norm": 0.32425001058262604, + "learning_rate": 0.0009915746370191701, + "loss": 2.5973, + "step": 757 + }, + { + "epoch": 0.08691663800022933, + "grad_norm": 0.3468982854015656, + "learning_rate": 0.0009915406571764471, + "loss": 2.7947, + "step": 758 + }, + { + "epoch": 0.08703130374957001, + "grad_norm": 0.3405498855389674, + "learning_rate": 0.0009915066095350603, + "loss": 2.731, + "step": 759 + }, + { + "epoch": 0.08714596949891068, + "grad_norm": 0.3404957960486945, + "learning_rate": 0.0009914724940997053, + "loss": 2.6851, + "step": 760 + }, + { + "epoch": 0.08726063524825135, + "grad_norm": 0.3408219490216612, + "learning_rate": 0.0009914383108750883, + "loss": 2.7239, + "step": 761 + }, + { + "epoch": 0.08737530099759203, + "grad_norm": 0.3844592933827754, + "learning_rate": 0.000991404059865924, + "loss": 2.6701, + "step": 762 + }, + { + "epoch": 0.08748996674693268, + "grad_norm": 0.3323816365970395, + "learning_rate": 0.0009913697410769366, + "loss": 2.6468, + "step": 763 + }, + { + "epoch": 0.08760463249627336, + "grad_norm": 0.30607982121961336, + "learning_rate": 0.0009913353545128597, + "loss": 2.6974, + "step": 764 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 0.34487323480870274, + "learning_rate": 0.0009913009001784364, + "loss": 2.8012, + "step": 765 + }, + { + "epoch": 0.0878339639949547, + "grad_norm": 0.32923920870338097, + "learning_rate": 0.0009912663780784188, + "loss": 2.7395, + "step": 766 + }, + { + "epoch": 0.08794862974429538, + "grad_norm": 0.3065849850457712, + "learning_rate": 0.000991231788217569, + "loss": 2.715, + "step": 767 + }, + { + "epoch": 0.08806329549363605, + "grad_norm": 0.31905220161800446, + "learning_rate": 0.0009911971306006575, + "loss": 2.6455, + "step": 768 + }, + { + "epoch": 0.08817796124297672, + "grad_norm": 0.35923787226643666, + "learning_rate": 0.000991162405232465, + "loss": 2.7436, + "step": 769 + }, + { + "epoch": 0.0882926269923174, + "grad_norm": 0.3432469176357402, + "learning_rate": 0.0009911276121177812, + "loss": 2.7743, + "step": 770 + }, + { + "epoch": 0.08840729274165807, + "grad_norm": 0.3373664179816129, + "learning_rate": 0.0009910927512614051, + "loss": 2.6684, + "step": 771 + }, + { + "epoch": 0.08852195849099874, + "grad_norm": 0.3347689265510937, + "learning_rate": 0.000991057822668145, + "loss": 2.7207, + "step": 772 + }, + { + "epoch": 0.08863662424033941, + "grad_norm": 0.3463916729415781, + "learning_rate": 0.0009910228263428186, + "loss": 2.9124, + "step": 773 + }, + { + "epoch": 0.08875128998968008, + "grad_norm": 0.34984784975831107, + "learning_rate": 0.000990987762290253, + "loss": 2.887, + "step": 774 + }, + { + "epoch": 0.08886595573902076, + "grad_norm": 0.4258067604126735, + "learning_rate": 0.0009909526305152848, + "loss": 2.7918, + "step": 775 + }, + { + "epoch": 0.08898062148836143, + "grad_norm": 0.3293218262617916, + "learning_rate": 0.0009909174310227596, + "loss": 2.7422, + "step": 776 + }, + { + "epoch": 0.0890952872377021, + "grad_norm": 0.3559968124688434, + "learning_rate": 0.0009908821638175325, + "loss": 2.7803, + "step": 777 + }, + { + "epoch": 0.08920995298704278, + "grad_norm": 0.34751120510254585, + "learning_rate": 0.000990846828904468, + "loss": 2.8157, + "step": 778 + }, + { + "epoch": 0.08932461873638345, + "grad_norm": 0.3092763044344859, + "learning_rate": 0.0009908114262884397, + "loss": 2.6105, + "step": 779 + }, + { + "epoch": 0.08943928448572411, + "grad_norm": 0.338958938313162, + "learning_rate": 0.0009907759559743311, + "loss": 2.7089, + "step": 780 + }, + { + "epoch": 0.08955395023506478, + "grad_norm": 0.36064855246869837, + "learning_rate": 0.0009907404179670342, + "loss": 2.916, + "step": 781 + }, + { + "epoch": 0.08966861598440545, + "grad_norm": 0.34218709162819727, + "learning_rate": 0.000990704812271451, + "loss": 2.7685, + "step": 782 + }, + { + "epoch": 0.08978328173374613, + "grad_norm": 0.31491020556970517, + "learning_rate": 0.0009906691388924928, + "loss": 2.788, + "step": 783 + }, + { + "epoch": 0.0898979474830868, + "grad_norm": 0.30196436362394613, + "learning_rate": 0.0009906333978350799, + "loss": 2.5976, + "step": 784 + }, + { + "epoch": 0.09001261323242747, + "grad_norm": 0.31564811746559535, + "learning_rate": 0.000990597589104142, + "loss": 2.6385, + "step": 785 + }, + { + "epoch": 0.09012727898176814, + "grad_norm": 0.30516805993385987, + "learning_rate": 0.0009905617127046182, + "loss": 2.7116, + "step": 786 + }, + { + "epoch": 0.09024194473110882, + "grad_norm": 0.3323257686605458, + "learning_rate": 0.0009905257686414573, + "loss": 2.7429, + "step": 787 + }, + { + "epoch": 0.09035661048044949, + "grad_norm": 0.33775436953277177, + "learning_rate": 0.0009904897569196168, + "loss": 2.7019, + "step": 788 + }, + { + "epoch": 0.09047127622979016, + "grad_norm": 0.3230868806083502, + "learning_rate": 0.0009904536775440641, + "loss": 2.5545, + "step": 789 + }, + { + "epoch": 0.09058594197913084, + "grad_norm": 0.4171155020390501, + "learning_rate": 0.0009904175305197752, + "loss": 2.8299, + "step": 790 + }, + { + "epoch": 0.09070060772847151, + "grad_norm": 0.3534959263460345, + "learning_rate": 0.0009903813158517363, + "loss": 2.6763, + "step": 791 + }, + { + "epoch": 0.09081527347781218, + "grad_norm": 0.3816764948075023, + "learning_rate": 0.0009903450335449423, + "loss": 2.8094, + "step": 792 + }, + { + "epoch": 0.09092993922715285, + "grad_norm": 0.3552510774815217, + "learning_rate": 0.0009903086836043978, + "loss": 2.7876, + "step": 793 + }, + { + "epoch": 0.09104460497649353, + "grad_norm": 0.33634011584573675, + "learning_rate": 0.0009902722660351166, + "loss": 2.7669, + "step": 794 + }, + { + "epoch": 0.0911592707258342, + "grad_norm": 0.367260508820492, + "learning_rate": 0.0009902357808421218, + "loss": 2.704, + "step": 795 + }, + { + "epoch": 0.09127393647517487, + "grad_norm": 0.35258351013888056, + "learning_rate": 0.0009901992280304456, + "loss": 2.6401, + "step": 796 + }, + { + "epoch": 0.09138860222451553, + "grad_norm": 0.29454054909104727, + "learning_rate": 0.00099016260760513, + "loss": 2.7961, + "step": 797 + }, + { + "epoch": 0.0915032679738562, + "grad_norm": 0.34810911807572176, + "learning_rate": 0.000990125919571226, + "loss": 2.6947, + "step": 798 + }, + { + "epoch": 0.09161793372319688, + "grad_norm": 0.3186633176550656, + "learning_rate": 0.000990089163933794, + "loss": 2.8424, + "step": 799 + }, + { + "epoch": 0.09173259947253755, + "grad_norm": 0.3386610558745488, + "learning_rate": 0.000990052340697904, + "loss": 2.8073, + "step": 800 + }, + { + "epoch": 0.09184726522187822, + "grad_norm": 0.31942280617581165, + "learning_rate": 0.0009900154498686349, + "loss": 2.7909, + "step": 801 + }, + { + "epoch": 0.0919619309712189, + "grad_norm": 0.32051896081066805, + "learning_rate": 0.0009899784914510748, + "loss": 2.8542, + "step": 802 + }, + { + "epoch": 0.09207659672055957, + "grad_norm": 0.3035296947368763, + "learning_rate": 0.0009899414654503216, + "loss": 2.7444, + "step": 803 + }, + { + "epoch": 0.09219126246990024, + "grad_norm": 0.32300455599940225, + "learning_rate": 0.0009899043718714826, + "loss": 2.7318, + "step": 804 + }, + { + "epoch": 0.09230592821924091, + "grad_norm": 0.3092276294728564, + "learning_rate": 0.0009898672107196739, + "loss": 2.6768, + "step": 805 + }, + { + "epoch": 0.09242059396858159, + "grad_norm": 0.3323756425112556, + "learning_rate": 0.000989829982000021, + "loss": 2.7118, + "step": 806 + }, + { + "epoch": 0.09253525971792226, + "grad_norm": 0.312506706483085, + "learning_rate": 0.000989792685717659, + "loss": 2.8544, + "step": 807 + }, + { + "epoch": 0.09264992546726293, + "grad_norm": 0.3132005664308122, + "learning_rate": 0.0009897553218777327, + "loss": 2.6641, + "step": 808 + }, + { + "epoch": 0.0927645912166036, + "grad_norm": 0.34130888476269583, + "learning_rate": 0.000989717890485395, + "loss": 2.7588, + "step": 809 + }, + { + "epoch": 0.09287925696594428, + "grad_norm": 0.33454793194644145, + "learning_rate": 0.0009896803915458094, + "loss": 2.8341, + "step": 810 + }, + { + "epoch": 0.09299392271528495, + "grad_norm": 0.3138451474325724, + "learning_rate": 0.0009896428250641479, + "loss": 2.6424, + "step": 811 + }, + { + "epoch": 0.09310858846462562, + "grad_norm": 0.3146705859461887, + "learning_rate": 0.000989605191045592, + "loss": 2.7295, + "step": 812 + }, + { + "epoch": 0.0932232542139663, + "grad_norm": 0.3183644182781799, + "learning_rate": 0.0009895674894953327, + "loss": 2.7056, + "step": 813 + }, + { + "epoch": 0.09333791996330695, + "grad_norm": 0.3260308382597346, + "learning_rate": 0.0009895297204185706, + "loss": 2.6179, + "step": 814 + }, + { + "epoch": 0.09345258571264763, + "grad_norm": 0.321161934289455, + "learning_rate": 0.0009894918838205145, + "loss": 2.7037, + "step": 815 + }, + { + "epoch": 0.0935672514619883, + "grad_norm": 0.32060529089265155, + "learning_rate": 0.0009894539797063837, + "loss": 2.6501, + "step": 816 + }, + { + "epoch": 0.09368191721132897, + "grad_norm": 0.3282000903566724, + "learning_rate": 0.0009894160080814061, + "loss": 2.7856, + "step": 817 + }, + { + "epoch": 0.09379658296066964, + "grad_norm": 0.3265395585254304, + "learning_rate": 0.0009893779689508194, + "loss": 2.7682, + "step": 818 + }, + { + "epoch": 0.09391124871001032, + "grad_norm": 0.33936198847015137, + "learning_rate": 0.0009893398623198703, + "loss": 2.8295, + "step": 819 + }, + { + "epoch": 0.09402591445935099, + "grad_norm": 0.3593274432801614, + "learning_rate": 0.0009893016881938148, + "loss": 2.8703, + "step": 820 + }, + { + "epoch": 0.09414058020869166, + "grad_norm": 0.30331589000282694, + "learning_rate": 0.0009892634465779185, + "loss": 2.7261, + "step": 821 + }, + { + "epoch": 0.09425524595803234, + "grad_norm": 0.3042107204750613, + "learning_rate": 0.000989225137477456, + "loss": 2.7993, + "step": 822 + }, + { + "epoch": 0.09436991170737301, + "grad_norm": 0.35729493002107116, + "learning_rate": 0.000989186760897711, + "loss": 2.7278, + "step": 823 + }, + { + "epoch": 0.09448457745671368, + "grad_norm": 0.29163803029769897, + "learning_rate": 0.0009891483168439773, + "loss": 2.7046, + "step": 824 + }, + { + "epoch": 0.09459924320605435, + "grad_norm": 0.3272907255403229, + "learning_rate": 0.000989109805321557, + "loss": 2.6643, + "step": 825 + }, + { + "epoch": 0.09471390895539503, + "grad_norm": 0.34726232660473333, + "learning_rate": 0.0009890712263357626, + "loss": 2.8609, + "step": 826 + }, + { + "epoch": 0.0948285747047357, + "grad_norm": 0.3337787837719053, + "learning_rate": 0.000989032579891915, + "loss": 2.6659, + "step": 827 + }, + { + "epoch": 0.09494324045407637, + "grad_norm": 0.3339243661461092, + "learning_rate": 0.000988993865995345, + "loss": 2.7355, + "step": 828 + }, + { + "epoch": 0.09505790620341704, + "grad_norm": 0.3138984574716539, + "learning_rate": 0.000988955084651392, + "loss": 2.6804, + "step": 829 + }, + { + "epoch": 0.09517257195275772, + "grad_norm": 0.33403955722498646, + "learning_rate": 0.0009889162358654056, + "loss": 2.7739, + "step": 830 + }, + { + "epoch": 0.09528723770209838, + "grad_norm": 0.3708382108265236, + "learning_rate": 0.000988877319642744, + "loss": 2.7989, + "step": 831 + }, + { + "epoch": 0.09540190345143905, + "grad_norm": 0.3505375742059597, + "learning_rate": 0.000988838335988775, + "loss": 2.7566, + "step": 832 + }, + { + "epoch": 0.09551656920077972, + "grad_norm": 0.3464552324259652, + "learning_rate": 0.0009887992849088754, + "loss": 2.8127, + "step": 833 + }, + { + "epoch": 0.0956312349501204, + "grad_norm": 0.3504053458712374, + "learning_rate": 0.000988760166408432, + "loss": 2.7198, + "step": 834 + }, + { + "epoch": 0.09574590069946107, + "grad_norm": 0.36863131843788105, + "learning_rate": 0.0009887209804928404, + "loss": 2.6591, + "step": 835 + }, + { + "epoch": 0.09586056644880174, + "grad_norm": 0.3420348091721313, + "learning_rate": 0.0009886817271675052, + "loss": 2.6698, + "step": 836 + }, + { + "epoch": 0.09597523219814241, + "grad_norm": 0.35749659591741795, + "learning_rate": 0.000988642406437841, + "loss": 2.7573, + "step": 837 + }, + { + "epoch": 0.09608989794748309, + "grad_norm": 0.3358964601273219, + "learning_rate": 0.0009886030183092712, + "loss": 2.7506, + "step": 838 + }, + { + "epoch": 0.09620456369682376, + "grad_norm": 0.3668737893660492, + "learning_rate": 0.0009885635627872285, + "loss": 2.8287, + "step": 839 + }, + { + "epoch": 0.09631922944616443, + "grad_norm": 0.3408134234258222, + "learning_rate": 0.0009885240398771554, + "loss": 2.7296, + "step": 840 + }, + { + "epoch": 0.0964338951955051, + "grad_norm": 0.33747561403912596, + "learning_rate": 0.0009884844495845029, + "loss": 2.7521, + "step": 841 + }, + { + "epoch": 0.09654856094484578, + "grad_norm": 0.34035208530181416, + "learning_rate": 0.000988444791914732, + "loss": 2.7553, + "step": 842 + }, + { + "epoch": 0.09666322669418645, + "grad_norm": 0.31514917599931935, + "learning_rate": 0.0009884050668733126, + "loss": 2.6745, + "step": 843 + }, + { + "epoch": 0.09677789244352712, + "grad_norm": 0.32159979058642046, + "learning_rate": 0.0009883652744657244, + "loss": 2.8209, + "step": 844 + }, + { + "epoch": 0.0968925581928678, + "grad_norm": 0.2929272262423057, + "learning_rate": 0.0009883254146974554, + "loss": 2.7286, + "step": 845 + }, + { + "epoch": 0.09700722394220847, + "grad_norm": 0.3247318052244169, + "learning_rate": 0.0009882854875740037, + "loss": 2.7182, + "step": 846 + }, + { + "epoch": 0.09712188969154914, + "grad_norm": 0.3070458372044395, + "learning_rate": 0.0009882454931008768, + "loss": 2.7498, + "step": 847 + }, + { + "epoch": 0.09723655544088981, + "grad_norm": 0.36472248262529544, + "learning_rate": 0.0009882054312835907, + "loss": 2.6909, + "step": 848 + }, + { + "epoch": 0.09735122119023047, + "grad_norm": 0.32982848852604985, + "learning_rate": 0.0009881653021276715, + "loss": 2.7232, + "step": 849 + }, + { + "epoch": 0.09746588693957114, + "grad_norm": 0.33090084786306473, + "learning_rate": 0.0009881251056386541, + "loss": 2.7515, + "step": 850 + }, + { + "epoch": 0.09758055268891182, + "grad_norm": 0.29511722670965806, + "learning_rate": 0.000988084841822083, + "loss": 2.7181, + "step": 851 + }, + { + "epoch": 0.09769521843825249, + "grad_norm": 0.3206775465844979, + "learning_rate": 0.0009880445106835117, + "loss": 2.7024, + "step": 852 + }, + { + "epoch": 0.09780988418759316, + "grad_norm": 0.3078499440357562, + "learning_rate": 0.000988004112228503, + "loss": 2.6708, + "step": 853 + }, + { + "epoch": 0.09792454993693384, + "grad_norm": 0.3242801278634592, + "learning_rate": 0.0009879636464626294, + "loss": 2.6326, + "step": 854 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 0.37575789365624146, + "learning_rate": 0.0009879231133914721, + "loss": 2.9144, + "step": 855 + }, + { + "epoch": 0.09815388143561518, + "grad_norm": 0.34069638352882087, + "learning_rate": 0.000987882513020622, + "loss": 2.8114, + "step": 856 + }, + { + "epoch": 0.09826854718495585, + "grad_norm": 0.31619135098553197, + "learning_rate": 0.000987841845355679, + "loss": 2.7787, + "step": 857 + }, + { + "epoch": 0.09838321293429653, + "grad_norm": 0.3510580409952719, + "learning_rate": 0.0009878011104022526, + "loss": 2.7922, + "step": 858 + }, + { + "epoch": 0.0984978786836372, + "grad_norm": 0.3322855120955192, + "learning_rate": 0.0009877603081659614, + "loss": 2.856, + "step": 859 + }, + { + "epoch": 0.09861254443297787, + "grad_norm": 0.3294385914718088, + "learning_rate": 0.0009877194386524334, + "loss": 2.7199, + "step": 860 + }, + { + "epoch": 0.09872721018231854, + "grad_norm": 0.33997121857874835, + "learning_rate": 0.0009876785018673054, + "loss": 2.7859, + "step": 861 + }, + { + "epoch": 0.09884187593165922, + "grad_norm": 0.3062185666041792, + "learning_rate": 0.0009876374978162242, + "loss": 2.7221, + "step": 862 + }, + { + "epoch": 0.09895654168099989, + "grad_norm": 0.33245848730584676, + "learning_rate": 0.0009875964265048452, + "loss": 2.7374, + "step": 863 + }, + { + "epoch": 0.09907120743034056, + "grad_norm": 0.31244086265623927, + "learning_rate": 0.0009875552879388336, + "loss": 2.7318, + "step": 864 + }, + { + "epoch": 0.09918587317968124, + "grad_norm": 0.2802373680461261, + "learning_rate": 0.000987514082123864, + "loss": 2.6616, + "step": 865 + }, + { + "epoch": 0.0993005389290219, + "grad_norm": 0.27883050987202396, + "learning_rate": 0.0009874728090656193, + "loss": 2.8188, + "step": 866 + }, + { + "epoch": 0.09941520467836257, + "grad_norm": 0.3213334584072041, + "learning_rate": 0.0009874314687697927, + "loss": 2.6124, + "step": 867 + }, + { + "epoch": 0.09952987042770324, + "grad_norm": 0.3096702355426216, + "learning_rate": 0.0009873900612420866, + "loss": 2.7301, + "step": 868 + }, + { + "epoch": 0.09964453617704391, + "grad_norm": 0.29201982224099887, + "learning_rate": 0.0009873485864882116, + "loss": 2.8175, + "step": 869 + }, + { + "epoch": 0.09975920192638459, + "grad_norm": 0.3139347670960125, + "learning_rate": 0.000987307044513889, + "loss": 2.7546, + "step": 870 + }, + { + "epoch": 0.09987386767572526, + "grad_norm": 0.3200740611375866, + "learning_rate": 0.0009872654353248486, + "loss": 2.8197, + "step": 871 + }, + { + "epoch": 0.09998853342506593, + "grad_norm": 0.2864072012925529, + "learning_rate": 0.0009872237589268295, + "loss": 2.6632, + "step": 872 + }, + { + "epoch": 0.1001031991744066, + "grad_norm": 0.35511885344886157, + "learning_rate": 0.00098718201532558, + "loss": 2.8809, + "step": 873 + }, + { + "epoch": 0.10021786492374728, + "grad_norm": 0.3586192878035327, + "learning_rate": 0.0009871402045268582, + "loss": 2.7683, + "step": 874 + }, + { + "epoch": 0.10033253067308795, + "grad_norm": 0.3477708934347381, + "learning_rate": 0.000987098326536431, + "loss": 2.8264, + "step": 875 + }, + { + "epoch": 0.10044719642242862, + "grad_norm": 0.3457708406259805, + "learning_rate": 0.0009870563813600744, + "loss": 2.709, + "step": 876 + }, + { + "epoch": 0.1005618621717693, + "grad_norm": 0.31621979228012626, + "learning_rate": 0.0009870143690035743, + "loss": 2.7643, + "step": 877 + }, + { + "epoch": 0.10067652792110997, + "grad_norm": 0.3443855725364193, + "learning_rate": 0.0009869722894727251, + "loss": 2.8125, + "step": 878 + }, + { + "epoch": 0.10079119367045064, + "grad_norm": 0.3462186702794076, + "learning_rate": 0.0009869301427733314, + "loss": 2.6844, + "step": 879 + }, + { + "epoch": 0.10090585941979131, + "grad_norm": 0.3668667990857489, + "learning_rate": 0.000986887928911206, + "loss": 2.7721, + "step": 880 + }, + { + "epoch": 0.10102052516913199, + "grad_norm": 0.3307752499719894, + "learning_rate": 0.0009868456478921719, + "loss": 2.6889, + "step": 881 + }, + { + "epoch": 0.10113519091847266, + "grad_norm": 0.3380673191616369, + "learning_rate": 0.0009868032997220608, + "loss": 2.7566, + "step": 882 + }, + { + "epoch": 0.10124985666781332, + "grad_norm": 0.3683146489378319, + "learning_rate": 0.0009867608844067136, + "loss": 2.7014, + "step": 883 + }, + { + "epoch": 0.10136452241715399, + "grad_norm": 0.3077492427814718, + "learning_rate": 0.000986718401951981, + "loss": 2.7852, + "step": 884 + }, + { + "epoch": 0.10147918816649466, + "grad_norm": 0.3380882148969396, + "learning_rate": 0.0009866758523637228, + "loss": 2.7306, + "step": 885 + }, + { + "epoch": 0.10159385391583534, + "grad_norm": 0.31518066604237777, + "learning_rate": 0.0009866332356478075, + "loss": 2.7999, + "step": 886 + }, + { + "epoch": 0.10170851966517601, + "grad_norm": 0.321232691672563, + "learning_rate": 0.000986590551810113, + "loss": 2.8056, + "step": 887 + }, + { + "epoch": 0.10182318541451668, + "grad_norm": 0.41336503236904193, + "learning_rate": 0.0009865478008565275, + "loss": 2.7064, + "step": 888 + }, + { + "epoch": 0.10193785116385735, + "grad_norm": 0.31803309832691334, + "learning_rate": 0.0009865049827929475, + "loss": 2.7417, + "step": 889 + }, + { + "epoch": 0.10205251691319803, + "grad_norm": 0.3107495413246145, + "learning_rate": 0.0009864620976252785, + "loss": 2.7376, + "step": 890 + }, + { + "epoch": 0.1021671826625387, + "grad_norm": 0.3316336868396862, + "learning_rate": 0.000986419145359436, + "loss": 2.7048, + "step": 891 + }, + { + "epoch": 0.10228184841187937, + "grad_norm": 0.32479074064184016, + "learning_rate": 0.0009863761260013443, + "loss": 2.8005, + "step": 892 + }, + { + "epoch": 0.10239651416122005, + "grad_norm": 0.3731166641753746, + "learning_rate": 0.0009863330395569374, + "loss": 2.9088, + "step": 893 + }, + { + "epoch": 0.10251117991056072, + "grad_norm": 0.3524287292259641, + "learning_rate": 0.000986289886032158, + "loss": 2.7194, + "step": 894 + }, + { + "epoch": 0.10262584565990139, + "grad_norm": 0.3186240583943147, + "learning_rate": 0.0009862466654329582, + "loss": 2.7267, + "step": 895 + }, + { + "epoch": 0.10274051140924206, + "grad_norm": 0.3275852320670076, + "learning_rate": 0.0009862033777652997, + "loss": 2.7583, + "step": 896 + }, + { + "epoch": 0.10285517715858274, + "grad_norm": 0.32148248170710536, + "learning_rate": 0.000986160023035153, + "loss": 2.7444, + "step": 897 + }, + { + "epoch": 0.10296984290792341, + "grad_norm": 0.34500432256976943, + "learning_rate": 0.0009861166012484982, + "loss": 2.6072, + "step": 898 + }, + { + "epoch": 0.10308450865726408, + "grad_norm": 0.33632349209929197, + "learning_rate": 0.0009860731124113247, + "loss": 2.7418, + "step": 899 + }, + { + "epoch": 0.10319917440660474, + "grad_norm": 0.3437098510618024, + "learning_rate": 0.0009860295565296306, + "loss": 2.7056, + "step": 900 + }, + { + "epoch": 0.10331384015594541, + "grad_norm": 0.310266689409354, + "learning_rate": 0.000985985933609424, + "loss": 2.5563, + "step": 901 + }, + { + "epoch": 0.10342850590528609, + "grad_norm": 0.33563157748964206, + "learning_rate": 0.0009859422436567212, + "loss": 2.7025, + "step": 902 + }, + { + "epoch": 0.10354317165462676, + "grad_norm": 0.3596718715418318, + "learning_rate": 0.000985898486677549, + "loss": 2.7307, + "step": 903 + }, + { + "epoch": 0.10365783740396743, + "grad_norm": 0.35428516359489975, + "learning_rate": 0.0009858546626779425, + "loss": 2.605, + "step": 904 + }, + { + "epoch": 0.1037725031533081, + "grad_norm": 0.3566856497476704, + "learning_rate": 0.0009858107716639464, + "loss": 2.8517, + "step": 905 + }, + { + "epoch": 0.10388716890264878, + "grad_norm": 0.3831760019290575, + "learning_rate": 0.000985766813641615, + "loss": 2.9544, + "step": 906 + }, + { + "epoch": 0.10400183465198945, + "grad_norm": 0.3831545895458341, + "learning_rate": 0.0009857227886170112, + "loss": 2.7249, + "step": 907 + }, + { + "epoch": 0.10411650040133012, + "grad_norm": 0.34025075196559396, + "learning_rate": 0.0009856786965962074, + "loss": 2.7835, + "step": 908 + }, + { + "epoch": 0.1042311661506708, + "grad_norm": 0.3452420180323608, + "learning_rate": 0.0009856345375852853, + "loss": 2.7445, + "step": 909 + }, + { + "epoch": 0.10434583190001147, + "grad_norm": 0.3165902527243338, + "learning_rate": 0.0009855903115903357, + "loss": 2.6954, + "step": 910 + }, + { + "epoch": 0.10446049764935214, + "grad_norm": 0.3503213420775051, + "learning_rate": 0.0009855460186174588, + "loss": 2.8186, + "step": 911 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.35478406299695237, + "learning_rate": 0.000985501658672764, + "loss": 2.8068, + "step": 912 + }, + { + "epoch": 0.10468982914803349, + "grad_norm": 0.35661653524013215, + "learning_rate": 0.0009854572317623698, + "loss": 2.6419, + "step": 913 + }, + { + "epoch": 0.10480449489737416, + "grad_norm": 0.3188442755029797, + "learning_rate": 0.0009854127378924043, + "loss": 2.746, + "step": 914 + }, + { + "epoch": 0.10491916064671483, + "grad_norm": 0.3252574475793499, + "learning_rate": 0.0009853681770690043, + "loss": 2.875, + "step": 915 + }, + { + "epoch": 0.1050338263960555, + "grad_norm": 0.3143554592782651, + "learning_rate": 0.0009853235492983164, + "loss": 2.7812, + "step": 916 + }, + { + "epoch": 0.10514849214539616, + "grad_norm": 0.31738337216505375, + "learning_rate": 0.000985278854586496, + "loss": 2.6693, + "step": 917 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.30387290517004506, + "learning_rate": 0.0009852340929397076, + "loss": 2.7041, + "step": 918 + }, + { + "epoch": 0.10537782364407751, + "grad_norm": 0.32024002747517516, + "learning_rate": 0.0009851892643641257, + "loss": 2.698, + "step": 919 + }, + { + "epoch": 0.10549248939341818, + "grad_norm": 0.3360100400602807, + "learning_rate": 0.000985144368865933, + "loss": 2.6534, + "step": 920 + }, + { + "epoch": 0.10560715514275885, + "grad_norm": 0.3327923598699991, + "learning_rate": 0.0009850994064513226, + "loss": 2.8473, + "step": 921 + }, + { + "epoch": 0.10572182089209953, + "grad_norm": 0.34937774561262047, + "learning_rate": 0.000985054377126496, + "loss": 2.9757, + "step": 922 + }, + { + "epoch": 0.1058364866414402, + "grad_norm": 0.31184386956552707, + "learning_rate": 0.0009850092808976639, + "loss": 2.6291, + "step": 923 + }, + { + "epoch": 0.10595115239078087, + "grad_norm": 0.2935477074302251, + "learning_rate": 0.0009849641177710467, + "loss": 2.7864, + "step": 924 + }, + { + "epoch": 0.10606581814012155, + "grad_norm": 0.2874727577927056, + "learning_rate": 0.0009849188877528736, + "loss": 2.727, + "step": 925 + }, + { + "epoch": 0.10618048388946222, + "grad_norm": 0.30490891533969455, + "learning_rate": 0.0009848735908493834, + "loss": 2.6966, + "step": 926 + }, + { + "epoch": 0.10629514963880289, + "grad_norm": 0.3211296397929058, + "learning_rate": 0.0009848282270668238, + "loss": 2.7091, + "step": 927 + }, + { + "epoch": 0.10640981538814356, + "grad_norm": 0.34374654659066245, + "learning_rate": 0.000984782796411452, + "loss": 2.7843, + "step": 928 + }, + { + "epoch": 0.10652448113748424, + "grad_norm": 0.2896515656205755, + "learning_rate": 0.0009847372988895343, + "loss": 2.5801, + "step": 929 + }, + { + "epoch": 0.10663914688682491, + "grad_norm": 0.3148035381061537, + "learning_rate": 0.000984691734507346, + "loss": 2.6524, + "step": 930 + }, + { + "epoch": 0.10675381263616558, + "grad_norm": 0.3410677274491946, + "learning_rate": 0.0009846461032711723, + "loss": 2.6706, + "step": 931 + }, + { + "epoch": 0.10686847838550625, + "grad_norm": 0.40222875297261407, + "learning_rate": 0.0009846004051873066, + "loss": 2.8881, + "step": 932 + }, + { + "epoch": 0.10698314413484693, + "grad_norm": 0.3583531015433631, + "learning_rate": 0.0009845546402620523, + "loss": 2.8905, + "step": 933 + }, + { + "epoch": 0.10709780988418759, + "grad_norm": 0.3246601675857077, + "learning_rate": 0.0009845088085017218, + "loss": 2.776, + "step": 934 + }, + { + "epoch": 0.10721247563352826, + "grad_norm": 0.3225621593967708, + "learning_rate": 0.000984462909912637, + "loss": 2.737, + "step": 935 + }, + { + "epoch": 0.10732714138286893, + "grad_norm": 0.49828952252497727, + "learning_rate": 0.0009844169445011282, + "loss": 2.6348, + "step": 936 + }, + { + "epoch": 0.1074418071322096, + "grad_norm": 0.30757707747773966, + "learning_rate": 0.0009843709122735358, + "loss": 2.7335, + "step": 937 + }, + { + "epoch": 0.10755647288155028, + "grad_norm": 0.2894803516290994, + "learning_rate": 0.000984324813236209, + "loss": 2.6535, + "step": 938 + }, + { + "epoch": 0.10767113863089095, + "grad_norm": 0.33881389323803873, + "learning_rate": 0.0009842786473955062, + "loss": 2.6218, + "step": 939 + }, + { + "epoch": 0.10778580438023162, + "grad_norm": 0.3025549629221241, + "learning_rate": 0.0009842324147577954, + "loss": 2.699, + "step": 940 + }, + { + "epoch": 0.1079004701295723, + "grad_norm": 0.293480584674943, + "learning_rate": 0.0009841861153294534, + "loss": 2.7151, + "step": 941 + }, + { + "epoch": 0.10801513587891297, + "grad_norm": 0.2846215062428214, + "learning_rate": 0.000984139749116866, + "loss": 2.6568, + "step": 942 + }, + { + "epoch": 0.10812980162825364, + "grad_norm": 0.32537499448071794, + "learning_rate": 0.0009840933161264288, + "loss": 2.7508, + "step": 943 + }, + { + "epoch": 0.10824446737759431, + "grad_norm": 0.34963029486376984, + "learning_rate": 0.0009840468163645462, + "loss": 2.6682, + "step": 944 + }, + { + "epoch": 0.10835913312693499, + "grad_norm": 0.3921614479006567, + "learning_rate": 0.0009840002498376322, + "loss": 2.7917, + "step": 945 + }, + { + "epoch": 0.10847379887627566, + "grad_norm": 0.3321891425571164, + "learning_rate": 0.0009839536165521094, + "loss": 2.7478, + "step": 946 + }, + { + "epoch": 0.10858846462561633, + "grad_norm": 0.28902044139401234, + "learning_rate": 0.0009839069165144103, + "loss": 2.7449, + "step": 947 + }, + { + "epoch": 0.108703130374957, + "grad_norm": 0.299189556004677, + "learning_rate": 0.0009838601497309763, + "loss": 2.7227, + "step": 948 + }, + { + "epoch": 0.10881779612429768, + "grad_norm": 0.32309741640074857, + "learning_rate": 0.0009838133162082578, + "loss": 2.6977, + "step": 949 + }, + { + "epoch": 0.10893246187363835, + "grad_norm": 0.3280750633785187, + "learning_rate": 0.0009837664159527146, + "loss": 2.6541, + "step": 950 + }, + { + "epoch": 0.10904712762297901, + "grad_norm": 0.30808341235343445, + "learning_rate": 0.0009837194489708157, + "loss": 2.8301, + "step": 951 + }, + { + "epoch": 0.10916179337231968, + "grad_norm": 0.30592288377064486, + "learning_rate": 0.0009836724152690395, + "loss": 2.7716, + "step": 952 + }, + { + "epoch": 0.10927645912166035, + "grad_norm": 0.3054094715700297, + "learning_rate": 0.0009836253148538731, + "loss": 2.6287, + "step": 953 + }, + { + "epoch": 0.10939112487100103, + "grad_norm": 0.33382065757584795, + "learning_rate": 0.0009835781477318133, + "loss": 2.9153, + "step": 954 + }, + { + "epoch": 0.1095057906203417, + "grad_norm": 0.329303492187594, + "learning_rate": 0.000983530913909366, + "loss": 2.7486, + "step": 955 + }, + { + "epoch": 0.10962045636968237, + "grad_norm": 0.33097903628142017, + "learning_rate": 0.0009834836133930458, + "loss": 2.7055, + "step": 956 + }, + { + "epoch": 0.10973512211902305, + "grad_norm": 0.30387516336737525, + "learning_rate": 0.0009834362461893773, + "loss": 2.6755, + "step": 957 + }, + { + "epoch": 0.10984978786836372, + "grad_norm": 0.32191404961036785, + "learning_rate": 0.0009833888123048937, + "loss": 2.7279, + "step": 958 + }, + { + "epoch": 0.10996445361770439, + "grad_norm": 0.32777686743095613, + "learning_rate": 0.0009833413117461378, + "loss": 2.7212, + "step": 959 + }, + { + "epoch": 0.11007911936704506, + "grad_norm": 0.34016131221907525, + "learning_rate": 0.0009832937445196613, + "loss": 2.7407, + "step": 960 + }, + { + "epoch": 0.11019378511638574, + "grad_norm": 0.331922025619126, + "learning_rate": 0.000983246110632025, + "loss": 2.8019, + "step": 961 + }, + { + "epoch": 0.11030845086572641, + "grad_norm": 0.3390794099029687, + "learning_rate": 0.0009831984100897994, + "loss": 2.7264, + "step": 962 + }, + { + "epoch": 0.11042311661506708, + "grad_norm": 0.3049985769813946, + "learning_rate": 0.0009831506428995636, + "loss": 2.6779, + "step": 963 + }, + { + "epoch": 0.11053778236440776, + "grad_norm": 0.33682156285593406, + "learning_rate": 0.0009831028090679064, + "loss": 2.7234, + "step": 964 + }, + { + "epoch": 0.11065244811374843, + "grad_norm": 0.3376055357398166, + "learning_rate": 0.0009830549086014254, + "loss": 2.7041, + "step": 965 + }, + { + "epoch": 0.1107671138630891, + "grad_norm": 0.31966388444642974, + "learning_rate": 0.0009830069415067276, + "loss": 2.8098, + "step": 966 + }, + { + "epoch": 0.11088177961242977, + "grad_norm": 0.3581563217040166, + "learning_rate": 0.0009829589077904293, + "loss": 2.8812, + "step": 967 + }, + { + "epoch": 0.11099644536177045, + "grad_norm": 0.3166314182435769, + "learning_rate": 0.0009829108074591556, + "loss": 2.6441, + "step": 968 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.36265769557937405, + "learning_rate": 0.0009828626405195412, + "loss": 2.744, + "step": 969 + }, + { + "epoch": 0.11122577686045178, + "grad_norm": 0.323841995827615, + "learning_rate": 0.0009828144069782296, + "loss": 2.8861, + "step": 970 + }, + { + "epoch": 0.11134044260979245, + "grad_norm": 0.3489483636270084, + "learning_rate": 0.0009827661068418738, + "loss": 2.6973, + "step": 971 + }, + { + "epoch": 0.11145510835913312, + "grad_norm": 0.3060280248774852, + "learning_rate": 0.0009827177401171361, + "loss": 2.6363, + "step": 972 + }, + { + "epoch": 0.1115697741084738, + "grad_norm": 0.32495859159485824, + "learning_rate": 0.0009826693068106876, + "loss": 2.6866, + "step": 973 + }, + { + "epoch": 0.11168443985781447, + "grad_norm": 0.30839826640265783, + "learning_rate": 0.0009826208069292086, + "loss": 2.7802, + "step": 974 + }, + { + "epoch": 0.11179910560715514, + "grad_norm": 0.3256547528909279, + "learning_rate": 0.000982572240479389, + "loss": 2.7974, + "step": 975 + }, + { + "epoch": 0.11191377135649581, + "grad_norm": 0.31968544062708854, + "learning_rate": 0.0009825236074679274, + "loss": 2.689, + "step": 976 + }, + { + "epoch": 0.11202843710583649, + "grad_norm": 0.3318767779182895, + "learning_rate": 0.0009824749079015318, + "loss": 2.8626, + "step": 977 + }, + { + "epoch": 0.11214310285517716, + "grad_norm": 0.33012813821594267, + "learning_rate": 0.0009824261417869197, + "loss": 2.7943, + "step": 978 + }, + { + "epoch": 0.11225776860451783, + "grad_norm": 0.3132083973180823, + "learning_rate": 0.000982377309130817, + "loss": 2.7097, + "step": 979 + }, + { + "epoch": 0.1123724343538585, + "grad_norm": 0.34153689355138867, + "learning_rate": 0.0009823284099399596, + "loss": 2.7981, + "step": 980 + }, + { + "epoch": 0.11248710010319918, + "grad_norm": 0.33657265966277156, + "learning_rate": 0.000982279444221092, + "loss": 2.6785, + "step": 981 + }, + { + "epoch": 0.11260176585253985, + "grad_norm": 0.3453268311427654, + "learning_rate": 0.0009822304119809682, + "loss": 2.6508, + "step": 982 + }, + { + "epoch": 0.11271643160188052, + "grad_norm": 0.3031988198988383, + "learning_rate": 0.0009821813132263513, + "loss": 2.7329, + "step": 983 + }, + { + "epoch": 0.1128310973512212, + "grad_norm": 0.32400632174632965, + "learning_rate": 0.0009821321479640134, + "loss": 2.6987, + "step": 984 + }, + { + "epoch": 0.11294576310056187, + "grad_norm": 0.37276020943315163, + "learning_rate": 0.0009820829162007357, + "loss": 2.8721, + "step": 985 + }, + { + "epoch": 0.11306042884990253, + "grad_norm": 0.3384612863614987, + "learning_rate": 0.0009820336179433091, + "loss": 2.9165, + "step": 986 + }, + { + "epoch": 0.1131750945992432, + "grad_norm": 0.33406800241198853, + "learning_rate": 0.0009819842531985337, + "loss": 2.8303, + "step": 987 + }, + { + "epoch": 0.11328976034858387, + "grad_norm": 0.31140342178259045, + "learning_rate": 0.0009819348219732176, + "loss": 2.7458, + "step": 988 + }, + { + "epoch": 0.11340442609792455, + "grad_norm": 0.2962579331183836, + "learning_rate": 0.0009818853242741796, + "loss": 2.6805, + "step": 989 + }, + { + "epoch": 0.11351909184726522, + "grad_norm": 0.28292167565820125, + "learning_rate": 0.0009818357601082467, + "loss": 2.8163, + "step": 990 + }, + { + "epoch": 0.11363375759660589, + "grad_norm": 0.3183572013446031, + "learning_rate": 0.0009817861294822551, + "loss": 2.753, + "step": 991 + }, + { + "epoch": 0.11374842334594656, + "grad_norm": 0.30681177470178445, + "learning_rate": 0.0009817364324030506, + "loss": 2.6307, + "step": 992 + }, + { + "epoch": 0.11386308909528724, + "grad_norm": 0.35465836811037893, + "learning_rate": 0.0009816866688774882, + "loss": 2.735, + "step": 993 + }, + { + "epoch": 0.11397775484462791, + "grad_norm": 0.3181644544792821, + "learning_rate": 0.0009816368389124314, + "loss": 2.8293, + "step": 994 + }, + { + "epoch": 0.11409242059396858, + "grad_norm": 0.34047824268106885, + "learning_rate": 0.0009815869425147537, + "loss": 2.7191, + "step": 995 + }, + { + "epoch": 0.11420708634330926, + "grad_norm": 0.32867782201404855, + "learning_rate": 0.0009815369796913373, + "loss": 2.7995, + "step": 996 + }, + { + "epoch": 0.11432175209264993, + "grad_norm": 0.3106698619228599, + "learning_rate": 0.0009814869504490731, + "loss": 2.5778, + "step": 997 + }, + { + "epoch": 0.1144364178419906, + "grad_norm": 0.3280618889900821, + "learning_rate": 0.0009814368547948623, + "loss": 2.7381, + "step": 998 + }, + { + "epoch": 0.11455108359133127, + "grad_norm": 0.3132780446348568, + "learning_rate": 0.0009813866927356142, + "loss": 2.6518, + "step": 999 + }, + { + "epoch": 0.11466574934067195, + "grad_norm": 0.32588876070319295, + "learning_rate": 0.000981336464278248, + "loss": 2.6893, + "step": 1000 + }, + { + "epoch": 0.11478041509001262, + "grad_norm": 0.3056702948458679, + "learning_rate": 0.0009812861694296917, + "loss": 2.7501, + "step": 1001 + }, + { + "epoch": 0.11489508083935329, + "grad_norm": 0.33183496907409415, + "learning_rate": 0.0009812358081968825, + "loss": 2.6796, + "step": 1002 + }, + { + "epoch": 0.11500974658869395, + "grad_norm": 0.34849853679526727, + "learning_rate": 0.0009811853805867668, + "loss": 2.5683, + "step": 1003 + }, + { + "epoch": 0.11512441233803462, + "grad_norm": 0.3031139174414606, + "learning_rate": 0.0009811348866063, + "loss": 2.7766, + "step": 1004 + }, + { + "epoch": 0.1152390780873753, + "grad_norm": 0.29881569866931484, + "learning_rate": 0.0009810843262624467, + "loss": 2.7064, + "step": 1005 + }, + { + "epoch": 0.11535374383671597, + "grad_norm": 0.31070186910350933, + "learning_rate": 0.000981033699562181, + "loss": 2.6935, + "step": 1006 + }, + { + "epoch": 0.11546840958605664, + "grad_norm": 0.3222251856197005, + "learning_rate": 0.0009809830065124858, + "loss": 2.6614, + "step": 1007 + }, + { + "epoch": 0.11558307533539731, + "grad_norm": 0.2974999123237951, + "learning_rate": 0.0009809322471203534, + "loss": 2.59, + "step": 1008 + }, + { + "epoch": 0.11569774108473799, + "grad_norm": 0.37053362927670497, + "learning_rate": 0.0009808814213927847, + "loss": 2.6886, + "step": 1009 + }, + { + "epoch": 0.11581240683407866, + "grad_norm": 0.34291666075195254, + "learning_rate": 0.0009808305293367904, + "loss": 2.7487, + "step": 1010 + }, + { + "epoch": 0.11592707258341933, + "grad_norm": 0.33293631217406694, + "learning_rate": 0.00098077957095939, + "loss": 2.6873, + "step": 1011 + }, + { + "epoch": 0.11604173833276, + "grad_norm": 0.31247215986790394, + "learning_rate": 0.0009807285462676122, + "loss": 2.6224, + "step": 1012 + }, + { + "epoch": 0.11615640408210068, + "grad_norm": 0.32406511337872285, + "learning_rate": 0.0009806774552684953, + "loss": 2.6588, + "step": 1013 + }, + { + "epoch": 0.11627106983144135, + "grad_norm": 0.32008670252823634, + "learning_rate": 0.0009806262979690857, + "loss": 2.7854, + "step": 1014 + }, + { + "epoch": 0.11638573558078202, + "grad_norm": 0.3302271667543452, + "learning_rate": 0.00098057507437644, + "loss": 2.5637, + "step": 1015 + }, + { + "epoch": 0.1165004013301227, + "grad_norm": 0.29000670475171997, + "learning_rate": 0.0009805237844976234, + "loss": 2.7351, + "step": 1016 + }, + { + "epoch": 0.11661506707946337, + "grad_norm": 0.343621392339593, + "learning_rate": 0.00098047242833971, + "loss": 2.8286, + "step": 1017 + }, + { + "epoch": 0.11672973282880404, + "grad_norm": 0.30661132222247955, + "learning_rate": 0.0009804210059097841, + "loss": 2.6705, + "step": 1018 + }, + { + "epoch": 0.11684439857814471, + "grad_norm": 0.30468794069984556, + "learning_rate": 0.0009803695172149382, + "loss": 2.807, + "step": 1019 + }, + { + "epoch": 0.11695906432748537, + "grad_norm": 0.317062828292318, + "learning_rate": 0.0009803179622622738, + "loss": 2.6885, + "step": 1020 + }, + { + "epoch": 0.11707373007682605, + "grad_norm": 0.35112012100663853, + "learning_rate": 0.0009802663410589023, + "loss": 2.7305, + "step": 1021 + }, + { + "epoch": 0.11718839582616672, + "grad_norm": 0.2969158129622786, + "learning_rate": 0.0009802146536119437, + "loss": 2.7355, + "step": 1022 + }, + { + "epoch": 0.11730306157550739, + "grad_norm": 0.3039129717959189, + "learning_rate": 0.0009801628999285274, + "loss": 2.6856, + "step": 1023 + }, + { + "epoch": 0.11741772732484806, + "grad_norm": 0.34030620948617085, + "learning_rate": 0.000980111080015792, + "loss": 2.7897, + "step": 1024 + }, + { + "epoch": 0.11753239307418874, + "grad_norm": 0.3120216069315966, + "learning_rate": 0.0009800591938808846, + "loss": 2.6794, + "step": 1025 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.3295414944699641, + "learning_rate": 0.0009800072415309623, + "loss": 2.708, + "step": 1026 + }, + { + "epoch": 0.11776172457287008, + "grad_norm": 0.3357949298264836, + "learning_rate": 0.0009799552229731907, + "loss": 2.8452, + "step": 1027 + }, + { + "epoch": 0.11787639032221076, + "grad_norm": 0.3125088740720094, + "learning_rate": 0.0009799031382147448, + "loss": 2.673, + "step": 1028 + }, + { + "epoch": 0.11799105607155143, + "grad_norm": 0.31571776739798657, + "learning_rate": 0.000979850987262809, + "loss": 2.6635, + "step": 1029 + }, + { + "epoch": 0.1181057218208921, + "grad_norm": 0.29182457208635965, + "learning_rate": 0.0009797987701245761, + "loss": 2.6937, + "step": 1030 + }, + { + "epoch": 0.11822038757023277, + "grad_norm": 0.3181162916916301, + "learning_rate": 0.0009797464868072487, + "loss": 2.7581, + "step": 1031 + }, + { + "epoch": 0.11833505331957345, + "grad_norm": 0.2963653908345339, + "learning_rate": 0.0009796941373180384, + "loss": 2.6714, + "step": 1032 + }, + { + "epoch": 0.11844971906891412, + "grad_norm": 0.3201736487434816, + "learning_rate": 0.0009796417216641653, + "loss": 2.7829, + "step": 1033 + }, + { + "epoch": 0.11856438481825479, + "grad_norm": 0.28811258884810137, + "learning_rate": 0.00097958923985286, + "loss": 2.6945, + "step": 1034 + }, + { + "epoch": 0.11867905056759546, + "grad_norm": 0.30028330229735706, + "learning_rate": 0.0009795366918913604, + "loss": 2.7629, + "step": 1035 + }, + { + "epoch": 0.11879371631693614, + "grad_norm": 0.29830822269162294, + "learning_rate": 0.0009794840777869152, + "loss": 2.6834, + "step": 1036 + }, + { + "epoch": 0.1189083820662768, + "grad_norm": 0.3451975774457418, + "learning_rate": 0.0009794313975467813, + "loss": 2.7251, + "step": 1037 + }, + { + "epoch": 0.11902304781561747, + "grad_norm": 0.28935256666239983, + "learning_rate": 0.0009793786511782248, + "loss": 2.6714, + "step": 1038 + }, + { + "epoch": 0.11913771356495814, + "grad_norm": 0.32535749919723506, + "learning_rate": 0.000979325838688521, + "loss": 2.6426, + "step": 1039 + }, + { + "epoch": 0.11925237931429881, + "grad_norm": 0.3547071532640959, + "learning_rate": 0.000979272960084955, + "loss": 2.6608, + "step": 1040 + }, + { + "epoch": 0.11936704506363949, + "grad_norm": 0.32764847392744145, + "learning_rate": 0.0009792200153748195, + "loss": 2.8062, + "step": 1041 + }, + { + "epoch": 0.11948171081298016, + "grad_norm": 0.30544866903508994, + "learning_rate": 0.0009791670045654177, + "loss": 2.614, + "step": 1042 + }, + { + "epoch": 0.11959637656232083, + "grad_norm": 0.3180552196899113, + "learning_rate": 0.0009791139276640614, + "loss": 2.7191, + "step": 1043 + }, + { + "epoch": 0.1197110423116615, + "grad_norm": 0.3675342377286231, + "learning_rate": 0.0009790607846780718, + "loss": 2.8023, + "step": 1044 + }, + { + "epoch": 0.11982570806100218, + "grad_norm": 0.3314896339481721, + "learning_rate": 0.0009790075756147783, + "loss": 2.7575, + "step": 1045 + }, + { + "epoch": 0.11994037381034285, + "grad_norm": 0.34456005189312017, + "learning_rate": 0.0009789543004815207, + "loss": 2.5887, + "step": 1046 + }, + { + "epoch": 0.12005503955968352, + "grad_norm": 0.3729557046401906, + "learning_rate": 0.000978900959285647, + "loss": 2.7911, + "step": 1047 + }, + { + "epoch": 0.1201697053090242, + "grad_norm": 0.30020050512761143, + "learning_rate": 0.0009788475520345146, + "loss": 2.7331, + "step": 1048 + }, + { + "epoch": 0.12028437105836487, + "grad_norm": 0.3113337001579776, + "learning_rate": 0.0009787940787354902, + "loss": 2.6733, + "step": 1049 + }, + { + "epoch": 0.12039903680770554, + "grad_norm": 0.3280475451522809, + "learning_rate": 0.000978740539395949, + "loss": 2.7824, + "step": 1050 + }, + { + "epoch": 0.12051370255704622, + "grad_norm": 0.3239225129492583, + "learning_rate": 0.0009786869340232761, + "loss": 2.7562, + "step": 1051 + }, + { + "epoch": 0.12062836830638689, + "grad_norm": 0.2962223313526621, + "learning_rate": 0.0009786332626248655, + "loss": 2.6495, + "step": 1052 + }, + { + "epoch": 0.12074303405572756, + "grad_norm": 0.31580422421558124, + "learning_rate": 0.0009785795252081199, + "loss": 2.8733, + "step": 1053 + }, + { + "epoch": 0.12085769980506822, + "grad_norm": 0.29448163542790834, + "learning_rate": 0.000978525721780451, + "loss": 2.8093, + "step": 1054 + }, + { + "epoch": 0.12097236555440889, + "grad_norm": 0.3030932921534723, + "learning_rate": 0.0009784718523492804, + "loss": 2.7391, + "step": 1055 + }, + { + "epoch": 0.12108703130374956, + "grad_norm": 0.2898693439714082, + "learning_rate": 0.0009784179169220384, + "loss": 2.7475, + "step": 1056 + }, + { + "epoch": 0.12120169705309024, + "grad_norm": 0.3112045966633896, + "learning_rate": 0.0009783639155061643, + "loss": 2.7234, + "step": 1057 + }, + { + "epoch": 0.12131636280243091, + "grad_norm": 0.3208099404963717, + "learning_rate": 0.0009783098481091063, + "loss": 2.8031, + "step": 1058 + }, + { + "epoch": 0.12143102855177158, + "grad_norm": 0.31157686899145115, + "learning_rate": 0.0009782557147383225, + "loss": 2.7221, + "step": 1059 + }, + { + "epoch": 0.12154569430111226, + "grad_norm": 0.3463634072050334, + "learning_rate": 0.0009782015154012789, + "loss": 2.7012, + "step": 1060 + }, + { + "epoch": 0.12166036005045293, + "grad_norm": 0.3319752121692342, + "learning_rate": 0.0009781472501054517, + "loss": 2.6427, + "step": 1061 + }, + { + "epoch": 0.1217750257997936, + "grad_norm": 0.31729331292484186, + "learning_rate": 0.0009780929188583256, + "loss": 2.7806, + "step": 1062 + }, + { + "epoch": 0.12188969154913427, + "grad_norm": 0.30301639848841, + "learning_rate": 0.000978038521667395, + "loss": 2.7263, + "step": 1063 + }, + { + "epoch": 0.12200435729847495, + "grad_norm": 0.3009568079740633, + "learning_rate": 0.000977984058540162, + "loss": 2.682, + "step": 1064 + }, + { + "epoch": 0.12211902304781562, + "grad_norm": 0.3228957479191051, + "learning_rate": 0.0009779295294841397, + "loss": 2.6556, + "step": 1065 + }, + { + "epoch": 0.12223368879715629, + "grad_norm": 0.3121393966380177, + "learning_rate": 0.0009778749345068487, + "loss": 2.709, + "step": 1066 + }, + { + "epoch": 0.12234835454649697, + "grad_norm": 0.3048310318104447, + "learning_rate": 0.00097782027361582, + "loss": 2.6889, + "step": 1067 + }, + { + "epoch": 0.12246302029583764, + "grad_norm": 0.2989077674025916, + "learning_rate": 0.0009777655468185924, + "loss": 2.8158, + "step": 1068 + }, + { + "epoch": 0.12257768604517831, + "grad_norm": 0.3088123474556999, + "learning_rate": 0.0009777107541227147, + "loss": 2.6555, + "step": 1069 + }, + { + "epoch": 0.12269235179451898, + "grad_norm": 0.2939190704120615, + "learning_rate": 0.0009776558955357443, + "loss": 2.747, + "step": 1070 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 0.30798364879326584, + "learning_rate": 0.0009776009710652483, + "loss": 2.8674, + "step": 1071 + }, + { + "epoch": 0.12292168329320032, + "grad_norm": 0.2910270897505483, + "learning_rate": 0.0009775459807188022, + "loss": 2.6254, + "step": 1072 + }, + { + "epoch": 0.12303634904254099, + "grad_norm": 0.29274768438081084, + "learning_rate": 0.0009774909245039909, + "loss": 2.5372, + "step": 1073 + }, + { + "epoch": 0.12315101479188166, + "grad_norm": 0.29508794714123326, + "learning_rate": 0.0009774358024284082, + "loss": 2.8244, + "step": 1074 + }, + { + "epoch": 0.12326568054122233, + "grad_norm": 0.29727437146382446, + "learning_rate": 0.0009773806144996575, + "loss": 2.7433, + "step": 1075 + }, + { + "epoch": 0.123380346290563, + "grad_norm": 0.3039161296405047, + "learning_rate": 0.0009773253607253507, + "loss": 2.6833, + "step": 1076 + }, + { + "epoch": 0.12349501203990368, + "grad_norm": 0.30848971838431627, + "learning_rate": 0.000977270041113109, + "loss": 2.7021, + "step": 1077 + }, + { + "epoch": 0.12360967778924435, + "grad_norm": 0.28643350409160817, + "learning_rate": 0.0009772146556705629, + "loss": 2.6838, + "step": 1078 + }, + { + "epoch": 0.12372434353858502, + "grad_norm": 0.3176345510416603, + "learning_rate": 0.0009771592044053512, + "loss": 2.6565, + "step": 1079 + }, + { + "epoch": 0.1238390092879257, + "grad_norm": 0.3207402413299687, + "learning_rate": 0.000977103687325123, + "loss": 2.7815, + "step": 1080 + }, + { + "epoch": 0.12395367503726637, + "grad_norm": 0.32518560099990723, + "learning_rate": 0.0009770481044375356, + "loss": 2.6238, + "step": 1081 + }, + { + "epoch": 0.12406834078660704, + "grad_norm": 0.33220037435475425, + "learning_rate": 0.0009769924557502553, + "loss": 2.6612, + "step": 1082 + }, + { + "epoch": 0.12418300653594772, + "grad_norm": 0.3233835772555074, + "learning_rate": 0.0009769367412709585, + "loss": 2.695, + "step": 1083 + }, + { + "epoch": 0.12429767228528839, + "grad_norm": 0.3053809351215719, + "learning_rate": 0.0009768809610073291, + "loss": 2.6801, + "step": 1084 + }, + { + "epoch": 0.12441233803462906, + "grad_norm": 0.3552921644812118, + "learning_rate": 0.0009768251149670614, + "loss": 2.6848, + "step": 1085 + }, + { + "epoch": 0.12452700378396973, + "grad_norm": 0.31310158450572734, + "learning_rate": 0.000976769203157858, + "loss": 2.764, + "step": 1086 + }, + { + "epoch": 0.1246416695333104, + "grad_norm": 0.3252755786154215, + "learning_rate": 0.0009767132255874315, + "loss": 2.6496, + "step": 1087 + }, + { + "epoch": 0.12475633528265107, + "grad_norm": 0.3030813600163628, + "learning_rate": 0.0009766571822635022, + "loss": 2.7656, + "step": 1088 + }, + { + "epoch": 0.12487100103199174, + "grad_norm": 0.3457076938028077, + "learning_rate": 0.0009766010731938007, + "loss": 2.7874, + "step": 1089 + }, + { + "epoch": 0.12498566678133241, + "grad_norm": 0.34068990976322255, + "learning_rate": 0.0009765448983860658, + "loss": 2.7769, + "step": 1090 + }, + { + "epoch": 0.1251003325306731, + "grad_norm": 0.309044070096772, + "learning_rate": 0.0009764886578480461, + "loss": 2.6511, + "step": 1091 + }, + { + "epoch": 0.12521499828001376, + "grad_norm": 0.32484793870159917, + "learning_rate": 0.0009764323515874986, + "loss": 2.6819, + "step": 1092 + }, + { + "epoch": 0.12532966402935444, + "grad_norm": 0.2935510513701517, + "learning_rate": 0.00097637597961219, + "loss": 2.7808, + "step": 1093 + }, + { + "epoch": 0.1254443297786951, + "grad_norm": 0.332200265241668, + "learning_rate": 0.0009763195419298955, + "loss": 2.8234, + "step": 1094 + }, + { + "epoch": 0.1255589955280358, + "grad_norm": 0.2970582850114562, + "learning_rate": 0.0009762630385483997, + "loss": 2.6976, + "step": 1095 + }, + { + "epoch": 0.12567366127737645, + "grad_norm": 0.302287282907232, + "learning_rate": 0.000976206469475496, + "loss": 2.6992, + "step": 1096 + }, + { + "epoch": 0.1257883270267171, + "grad_norm": 0.31141846609684387, + "learning_rate": 0.0009761498347189872, + "loss": 2.7116, + "step": 1097 + }, + { + "epoch": 0.1259029927760578, + "grad_norm": 0.2976244344291253, + "learning_rate": 0.000976093134286685, + "loss": 2.565, + "step": 1098 + }, + { + "epoch": 0.12601765852539845, + "grad_norm": 0.30526636304771265, + "learning_rate": 0.0009760363681864102, + "loss": 2.7651, + "step": 1099 + }, + { + "epoch": 0.12613232427473914, + "grad_norm": 0.2797712550011633, + "learning_rate": 0.0009759795364259923, + "loss": 2.5988, + "step": 1100 + }, + { + "epoch": 0.1262469900240798, + "grad_norm": 0.3166073089497224, + "learning_rate": 0.0009759226390132704, + "loss": 2.7872, + "step": 1101 + }, + { + "epoch": 0.12636165577342048, + "grad_norm": 0.30187441498020534, + "learning_rate": 0.0009758656759560923, + "loss": 2.7741, + "step": 1102 + }, + { + "epoch": 0.12647632152276114, + "grad_norm": 0.29614941276690565, + "learning_rate": 0.0009758086472623151, + "loss": 2.619, + "step": 1103 + }, + { + "epoch": 0.12659098727210183, + "grad_norm": 0.3249053678561645, + "learning_rate": 0.0009757515529398047, + "loss": 2.6033, + "step": 1104 + }, + { + "epoch": 0.1267056530214425, + "grad_norm": 0.2921780022441539, + "learning_rate": 0.0009756943929964363, + "loss": 2.7473, + "step": 1105 + }, + { + "epoch": 0.12682031877078317, + "grad_norm": 0.3330855539494519, + "learning_rate": 0.0009756371674400939, + "loss": 2.6536, + "step": 1106 + }, + { + "epoch": 0.12693498452012383, + "grad_norm": 0.2788316353800275, + "learning_rate": 0.0009755798762786707, + "loss": 2.6924, + "step": 1107 + }, + { + "epoch": 0.12704965026946452, + "grad_norm": 0.31732418324080036, + "learning_rate": 0.0009755225195200689, + "loss": 2.8171, + "step": 1108 + }, + { + "epoch": 0.12716431601880518, + "grad_norm": 0.35264739938729056, + "learning_rate": 0.0009754650971722, + "loss": 2.655, + "step": 1109 + }, + { + "epoch": 0.12727898176814587, + "grad_norm": 0.30980758203345504, + "learning_rate": 0.000975407609242984, + "loss": 2.6685, + "step": 1110 + }, + { + "epoch": 0.12739364751748652, + "grad_norm": 0.3273127622839704, + "learning_rate": 0.0009753500557403504, + "loss": 2.7617, + "step": 1111 + }, + { + "epoch": 0.1275083132668272, + "grad_norm": 0.3459824726805483, + "learning_rate": 0.0009752924366722376, + "loss": 2.8057, + "step": 1112 + }, + { + "epoch": 0.12762297901616787, + "grad_norm": 0.31908221668773695, + "learning_rate": 0.0009752347520465931, + "loss": 2.7207, + "step": 1113 + }, + { + "epoch": 0.12773764476550853, + "grad_norm": 0.3259372237077058, + "learning_rate": 0.0009751770018713734, + "loss": 2.6697, + "step": 1114 + }, + { + "epoch": 0.12785231051484922, + "grad_norm": 0.3019415271769574, + "learning_rate": 0.0009751191861545439, + "loss": 2.6886, + "step": 1115 + }, + { + "epoch": 0.12796697626418987, + "grad_norm": 0.3475581757215973, + "learning_rate": 0.0009750613049040792, + "loss": 2.6857, + "step": 1116 + }, + { + "epoch": 0.12808164201353056, + "grad_norm": 0.30116006185853883, + "learning_rate": 0.0009750033581279632, + "loss": 2.8236, + "step": 1117 + }, + { + "epoch": 0.12819630776287122, + "grad_norm": 0.3002910415084012, + "learning_rate": 0.0009749453458341882, + "loss": 2.6827, + "step": 1118 + }, + { + "epoch": 0.1283109735122119, + "grad_norm": 0.2856142338336959, + "learning_rate": 0.000974887268030756, + "loss": 2.6603, + "step": 1119 + }, + { + "epoch": 0.12842563926155257, + "grad_norm": 0.3046145549292053, + "learning_rate": 0.0009748291247256774, + "loss": 2.7755, + "step": 1120 + }, + { + "epoch": 0.12854030501089325, + "grad_norm": 0.30041920813829953, + "learning_rate": 0.000974770915926972, + "loss": 2.6955, + "step": 1121 + }, + { + "epoch": 0.1286549707602339, + "grad_norm": 0.3033132887627643, + "learning_rate": 0.0009747126416426688, + "loss": 2.7433, + "step": 1122 + }, + { + "epoch": 0.1287696365095746, + "grad_norm": 0.289999150738605, + "learning_rate": 0.0009746543018808057, + "loss": 2.7022, + "step": 1123 + }, + { + "epoch": 0.12888430225891526, + "grad_norm": 0.3081213840886586, + "learning_rate": 0.000974595896649429, + "loss": 2.7157, + "step": 1124 + }, + { + "epoch": 0.12899896800825594, + "grad_norm": 0.3303129074752545, + "learning_rate": 0.0009745374259565953, + "loss": 2.6022, + "step": 1125 + }, + { + "epoch": 0.1291136337575966, + "grad_norm": 0.3311561370301218, + "learning_rate": 0.0009744788898103691, + "loss": 2.6989, + "step": 1126 + }, + { + "epoch": 0.1292282995069373, + "grad_norm": 0.3180305084579922, + "learning_rate": 0.0009744202882188245, + "loss": 2.6408, + "step": 1127 + }, + { + "epoch": 0.12934296525627795, + "grad_norm": 0.3578259835953817, + "learning_rate": 0.0009743616211900443, + "loss": 2.859, + "step": 1128 + }, + { + "epoch": 0.12945763100561863, + "grad_norm": 0.3181880369826049, + "learning_rate": 0.0009743028887321206, + "loss": 2.6293, + "step": 1129 + }, + { + "epoch": 0.1295722967549593, + "grad_norm": 0.3077277311050923, + "learning_rate": 0.0009742440908531545, + "loss": 2.747, + "step": 1130 + }, + { + "epoch": 0.12968696250429995, + "grad_norm": 0.31052912123009196, + "learning_rate": 0.0009741852275612559, + "loss": 2.5986, + "step": 1131 + }, + { + "epoch": 0.12980162825364064, + "grad_norm": 0.32189877072938033, + "learning_rate": 0.0009741262988645441, + "loss": 2.7397, + "step": 1132 + }, + { + "epoch": 0.1299162940029813, + "grad_norm": 0.30705193054249585, + "learning_rate": 0.000974067304771147, + "loss": 2.6544, + "step": 1133 + }, + { + "epoch": 0.13003095975232198, + "grad_norm": 0.2875146345861382, + "learning_rate": 0.0009740082452892017, + "loss": 2.5831, + "step": 1134 + }, + { + "epoch": 0.13014562550166264, + "grad_norm": 0.32572832711086697, + "learning_rate": 0.0009739491204268545, + "loss": 2.7258, + "step": 1135 + }, + { + "epoch": 0.13026029125100333, + "grad_norm": 0.3243907333544116, + "learning_rate": 0.0009738899301922602, + "loss": 2.7482, + "step": 1136 + }, + { + "epoch": 0.130374957000344, + "grad_norm": 0.2883622238077074, + "learning_rate": 0.0009738306745935833, + "loss": 2.7093, + "step": 1137 + }, + { + "epoch": 0.13048962274968468, + "grad_norm": 0.30196184124992387, + "learning_rate": 0.0009737713536389969, + "loss": 2.6681, + "step": 1138 + }, + { + "epoch": 0.13060428849902533, + "grad_norm": 0.3159333031052521, + "learning_rate": 0.0009737119673366832, + "loss": 2.6835, + "step": 1139 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.3368608940112984, + "learning_rate": 0.0009736525156948333, + "loss": 2.6257, + "step": 1140 + }, + { + "epoch": 0.13083361999770668, + "grad_norm": 0.2800491980984583, + "learning_rate": 0.0009735929987216476, + "loss": 2.645, + "step": 1141 + }, + { + "epoch": 0.13094828574704737, + "grad_norm": 0.32048987322587724, + "learning_rate": 0.0009735334164253351, + "loss": 2.6335, + "step": 1142 + }, + { + "epoch": 0.13106295149638802, + "grad_norm": 0.29757897848562, + "learning_rate": 0.0009734737688141142, + "loss": 2.6446, + "step": 1143 + }, + { + "epoch": 0.1311776172457287, + "grad_norm": 0.33408283647977166, + "learning_rate": 0.0009734140558962123, + "loss": 2.8177, + "step": 1144 + }, + { + "epoch": 0.13129228299506937, + "grad_norm": 0.3207080755131899, + "learning_rate": 0.0009733542776798653, + "loss": 2.7055, + "step": 1145 + }, + { + "epoch": 0.13140694874441006, + "grad_norm": 0.29192499803903355, + "learning_rate": 0.0009732944341733188, + "loss": 2.7458, + "step": 1146 + }, + { + "epoch": 0.13152161449375072, + "grad_norm": 0.2942113010955268, + "learning_rate": 0.0009732345253848267, + "loss": 2.5987, + "step": 1147 + }, + { + "epoch": 0.13163628024309137, + "grad_norm": 0.2728831918675034, + "learning_rate": 0.0009731745513226526, + "loss": 2.7414, + "step": 1148 + }, + { + "epoch": 0.13175094599243206, + "grad_norm": 0.3002007428802995, + "learning_rate": 0.0009731145119950686, + "loss": 2.6916, + "step": 1149 + }, + { + "epoch": 0.13186561174177272, + "grad_norm": 0.3310801693208304, + "learning_rate": 0.0009730544074103562, + "loss": 2.7173, + "step": 1150 + }, + { + "epoch": 0.1319802774911134, + "grad_norm": 0.2872578269705733, + "learning_rate": 0.0009729942375768055, + "loss": 2.6925, + "step": 1151 + }, + { + "epoch": 0.13209494324045407, + "grad_norm": 0.3339589334365125, + "learning_rate": 0.0009729340025027158, + "loss": 2.877, + "step": 1152 + }, + { + "epoch": 0.13220960898979475, + "grad_norm": 0.30468268859878117, + "learning_rate": 0.0009728737021963954, + "loss": 2.6462, + "step": 1153 + }, + { + "epoch": 0.1323242747391354, + "grad_norm": 0.3090841891203495, + "learning_rate": 0.0009728133366661615, + "loss": 2.6178, + "step": 1154 + }, + { + "epoch": 0.1324389404884761, + "grad_norm": 0.325383874425344, + "learning_rate": 0.0009727529059203406, + "loss": 2.7653, + "step": 1155 + }, + { + "epoch": 0.13255360623781676, + "grad_norm": 0.34188021187078127, + "learning_rate": 0.0009726924099672676, + "loss": 2.7311, + "step": 1156 + }, + { + "epoch": 0.13266827198715744, + "grad_norm": 0.31400848735268017, + "learning_rate": 0.0009726318488152872, + "loss": 2.7306, + "step": 1157 + }, + { + "epoch": 0.1327829377364981, + "grad_norm": 0.33309705349144986, + "learning_rate": 0.0009725712224727523, + "loss": 2.7348, + "step": 1158 + }, + { + "epoch": 0.1328976034858388, + "grad_norm": 0.34229675472637827, + "learning_rate": 0.0009725105309480253, + "loss": 2.6122, + "step": 1159 + }, + { + "epoch": 0.13301226923517945, + "grad_norm": 0.30013825254366966, + "learning_rate": 0.0009724497742494776, + "loss": 2.6623, + "step": 1160 + }, + { + "epoch": 0.13312693498452013, + "grad_norm": 0.3034117941043357, + "learning_rate": 0.000972388952385489, + "loss": 2.6084, + "step": 1161 + }, + { + "epoch": 0.1332416007338608, + "grad_norm": 0.3308573757821191, + "learning_rate": 0.000972328065364449, + "loss": 2.6919, + "step": 1162 + }, + { + "epoch": 0.13335626648320148, + "grad_norm": 0.33443760037582665, + "learning_rate": 0.0009722671131947559, + "loss": 2.7983, + "step": 1163 + }, + { + "epoch": 0.13347093223254214, + "grad_norm": 0.28384016287149655, + "learning_rate": 0.0009722060958848168, + "loss": 2.7731, + "step": 1164 + }, + { + "epoch": 0.13358559798188283, + "grad_norm": 0.3131121432846602, + "learning_rate": 0.0009721450134430478, + "loss": 2.7403, + "step": 1165 + }, + { + "epoch": 0.13370026373122348, + "grad_norm": 0.3077864832515649, + "learning_rate": 0.000972083865877874, + "loss": 2.5896, + "step": 1166 + }, + { + "epoch": 0.13381492948056414, + "grad_norm": 0.29984520552459987, + "learning_rate": 0.0009720226531977296, + "loss": 2.6777, + "step": 1167 + }, + { + "epoch": 0.13392959522990483, + "grad_norm": 0.28246607776110233, + "learning_rate": 0.0009719613754110578, + "loss": 2.6972, + "step": 1168 + }, + { + "epoch": 0.1340442609792455, + "grad_norm": 0.3319968429482706, + "learning_rate": 0.0009719000325263109, + "loss": 2.8027, + "step": 1169 + }, + { + "epoch": 0.13415892672858618, + "grad_norm": 0.29292200869679685, + "learning_rate": 0.0009718386245519495, + "loss": 2.5638, + "step": 1170 + }, + { + "epoch": 0.13427359247792683, + "grad_norm": 0.28135277511039536, + "learning_rate": 0.0009717771514964439, + "loss": 2.579, + "step": 1171 + }, + { + "epoch": 0.13438825822726752, + "grad_norm": 0.3202216952374538, + "learning_rate": 0.0009717156133682734, + "loss": 2.7501, + "step": 1172 + }, + { + "epoch": 0.13450292397660818, + "grad_norm": 0.3334465306023394, + "learning_rate": 0.0009716540101759255, + "loss": 2.8353, + "step": 1173 + }, + { + "epoch": 0.13461758972594887, + "grad_norm": 0.31196363353936807, + "learning_rate": 0.0009715923419278976, + "loss": 2.7734, + "step": 1174 + }, + { + "epoch": 0.13473225547528953, + "grad_norm": 0.3264945580334173, + "learning_rate": 0.0009715306086326954, + "loss": 2.5605, + "step": 1175 + }, + { + "epoch": 0.1348469212246302, + "grad_norm": 0.3007343039110142, + "learning_rate": 0.0009714688102988339, + "loss": 2.5979, + "step": 1176 + }, + { + "epoch": 0.13496158697397087, + "grad_norm": 0.3011021234522493, + "learning_rate": 0.000971406946934837, + "loss": 2.7176, + "step": 1177 + }, + { + "epoch": 0.13507625272331156, + "grad_norm": 0.3079224691494148, + "learning_rate": 0.0009713450185492378, + "loss": 2.6673, + "step": 1178 + }, + { + "epoch": 0.13519091847265222, + "grad_norm": 0.31251648830122747, + "learning_rate": 0.0009712830251505778, + "loss": 2.7343, + "step": 1179 + }, + { + "epoch": 0.1353055842219929, + "grad_norm": 0.28724955637607563, + "learning_rate": 0.0009712209667474079, + "loss": 2.6482, + "step": 1180 + }, + { + "epoch": 0.13542024997133356, + "grad_norm": 0.3007219760730107, + "learning_rate": 0.0009711588433482881, + "loss": 2.6916, + "step": 1181 + }, + { + "epoch": 0.13553491572067425, + "grad_norm": 0.30178334823273617, + "learning_rate": 0.0009710966549617868, + "loss": 2.68, + "step": 1182 + }, + { + "epoch": 0.1356495814700149, + "grad_norm": 0.3062269145084652, + "learning_rate": 0.0009710344015964819, + "loss": 2.756, + "step": 1183 + }, + { + "epoch": 0.13576424721935557, + "grad_norm": 0.32190065767209086, + "learning_rate": 0.00097097208326096, + "loss": 2.714, + "step": 1184 + }, + { + "epoch": 0.13587891296869625, + "grad_norm": 0.3168972476878082, + "learning_rate": 0.000970909699963817, + "loss": 2.6403, + "step": 1185 + }, + { + "epoch": 0.1359935787180369, + "grad_norm": 0.36325447792357723, + "learning_rate": 0.0009708472517136569, + "loss": 2.7753, + "step": 1186 + }, + { + "epoch": 0.1361082444673776, + "grad_norm": 0.3080950849155879, + "learning_rate": 0.0009707847385190938, + "loss": 2.6844, + "step": 1187 + }, + { + "epoch": 0.13622291021671826, + "grad_norm": 0.33777156883881254, + "learning_rate": 0.00097072216038875, + "loss": 2.7788, + "step": 1188 + }, + { + "epoch": 0.13633757596605894, + "grad_norm": 0.33986276703110124, + "learning_rate": 0.000970659517331257, + "loss": 2.6061, + "step": 1189 + }, + { + "epoch": 0.1364522417153996, + "grad_norm": 0.32138342537228204, + "learning_rate": 0.000970596809355255, + "loss": 2.7633, + "step": 1190 + }, + { + "epoch": 0.1365669074647403, + "grad_norm": 0.30510092766617586, + "learning_rate": 0.0009705340364693935, + "loss": 2.7706, + "step": 1191 + }, + { + "epoch": 0.13668157321408095, + "grad_norm": 0.28188918035264077, + "learning_rate": 0.0009704711986823311, + "loss": 2.7453, + "step": 1192 + }, + { + "epoch": 0.13679623896342163, + "grad_norm": 0.2794516966437732, + "learning_rate": 0.0009704082960027348, + "loss": 2.7083, + "step": 1193 + }, + { + "epoch": 0.1369109047127623, + "grad_norm": 0.3039044119088668, + "learning_rate": 0.0009703453284392807, + "loss": 2.7217, + "step": 1194 + }, + { + "epoch": 0.13702557046210298, + "grad_norm": 0.28055467299292364, + "learning_rate": 0.0009702822960006544, + "loss": 2.6503, + "step": 1195 + }, + { + "epoch": 0.13714023621144364, + "grad_norm": 0.2894232734298314, + "learning_rate": 0.0009702191986955494, + "loss": 2.775, + "step": 1196 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 0.3133021396483607, + "learning_rate": 0.0009701560365326694, + "loss": 2.5516, + "step": 1197 + }, + { + "epoch": 0.13736956771012498, + "grad_norm": 0.30750976085729925, + "learning_rate": 0.0009700928095207259, + "loss": 2.6066, + "step": 1198 + }, + { + "epoch": 0.13748423345946567, + "grad_norm": 0.3141226187293326, + "learning_rate": 0.0009700295176684404, + "loss": 2.8212, + "step": 1199 + }, + { + "epoch": 0.13759889920880633, + "grad_norm": 0.33568159593202296, + "learning_rate": 0.0009699661609845425, + "loss": 2.6958, + "step": 1200 + }, + { + "epoch": 0.137713564958147, + "grad_norm": 0.3924992017437212, + "learning_rate": 0.0009699027394777708, + "loss": 2.6935, + "step": 1201 + }, + { + "epoch": 0.13782823070748768, + "grad_norm": 0.3238370691785521, + "learning_rate": 0.0009698392531568736, + "loss": 2.6071, + "step": 1202 + }, + { + "epoch": 0.13794289645682833, + "grad_norm": 0.30018134253066026, + "learning_rate": 0.0009697757020306072, + "loss": 2.6973, + "step": 1203 + }, + { + "epoch": 0.13805756220616902, + "grad_norm": 0.2702858077957881, + "learning_rate": 0.0009697120861077375, + "loss": 2.556, + "step": 1204 + }, + { + "epoch": 0.13817222795550968, + "grad_norm": 0.3313537349141092, + "learning_rate": 0.0009696484053970391, + "loss": 2.6391, + "step": 1205 + }, + { + "epoch": 0.13828689370485037, + "grad_norm": 0.27215853921810795, + "learning_rate": 0.0009695846599072955, + "loss": 2.6042, + "step": 1206 + }, + { + "epoch": 0.13840155945419103, + "grad_norm": 0.2901045100650714, + "learning_rate": 0.0009695208496472991, + "loss": 2.8205, + "step": 1207 + }, + { + "epoch": 0.1385162252035317, + "grad_norm": 0.28406915639804525, + "learning_rate": 0.0009694569746258514, + "loss": 2.6665, + "step": 1208 + }, + { + "epoch": 0.13863089095287237, + "grad_norm": 0.31733576097274757, + "learning_rate": 0.0009693930348517628, + "loss": 2.6298, + "step": 1209 + }, + { + "epoch": 0.13874555670221306, + "grad_norm": 0.29822463413826206, + "learning_rate": 0.0009693290303338524, + "loss": 2.5626, + "step": 1210 + }, + { + "epoch": 0.13886022245155372, + "grad_norm": 0.3171445959280755, + "learning_rate": 0.0009692649610809485, + "loss": 2.6635, + "step": 1211 + }, + { + "epoch": 0.1389748882008944, + "grad_norm": 0.3293705461700094, + "learning_rate": 0.0009692008271018883, + "loss": 2.7593, + "step": 1212 + }, + { + "epoch": 0.13908955395023506, + "grad_norm": 0.32452528692554516, + "learning_rate": 0.0009691366284055176, + "loss": 2.6933, + "step": 1213 + }, + { + "epoch": 0.13920421969957575, + "grad_norm": 0.3386564363866774, + "learning_rate": 0.0009690723650006917, + "loss": 2.6225, + "step": 1214 + }, + { + "epoch": 0.1393188854489164, + "grad_norm": 0.32089631798757984, + "learning_rate": 0.0009690080368962744, + "loss": 2.6137, + "step": 1215 + }, + { + "epoch": 0.1394335511982571, + "grad_norm": 0.31722679776556845, + "learning_rate": 0.0009689436441011384, + "loss": 2.7521, + "step": 1216 + }, + { + "epoch": 0.13954821694759775, + "grad_norm": 0.3263786820169047, + "learning_rate": 0.0009688791866241657, + "loss": 2.6015, + "step": 1217 + }, + { + "epoch": 0.1396628826969384, + "grad_norm": 0.3164623329978417, + "learning_rate": 0.0009688146644742468, + "loss": 2.7459, + "step": 1218 + }, + { + "epoch": 0.1397775484462791, + "grad_norm": 0.3121491943468524, + "learning_rate": 0.0009687500776602813, + "loss": 2.7994, + "step": 1219 + }, + { + "epoch": 0.13989221419561976, + "grad_norm": 0.3028278744215888, + "learning_rate": 0.0009686854261911779, + "loss": 2.7365, + "step": 1220 + }, + { + "epoch": 0.14000687994496044, + "grad_norm": 0.30086605090098334, + "learning_rate": 0.0009686207100758538, + "loss": 2.5864, + "step": 1221 + }, + { + "epoch": 0.1401215456943011, + "grad_norm": 0.30514876042062433, + "learning_rate": 0.0009685559293232355, + "loss": 2.7421, + "step": 1222 + }, + { + "epoch": 0.1402362114436418, + "grad_norm": 0.3101606454694243, + "learning_rate": 0.0009684910839422582, + "loss": 2.7105, + "step": 1223 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 0.2953717253506576, + "learning_rate": 0.0009684261739418663, + "loss": 2.5598, + "step": 1224 + }, + { + "epoch": 0.14046554294232313, + "grad_norm": 0.27256960668823443, + "learning_rate": 0.0009683611993310127, + "loss": 2.6238, + "step": 1225 + }, + { + "epoch": 0.1405802086916638, + "grad_norm": 0.3051491224427656, + "learning_rate": 0.0009682961601186593, + "loss": 2.689, + "step": 1226 + }, + { + "epoch": 0.14069487444100448, + "grad_norm": 0.2915298850415209, + "learning_rate": 0.000968231056313777, + "loss": 2.7235, + "step": 1227 + }, + { + "epoch": 0.14080954019034514, + "grad_norm": 0.28938583743316676, + "learning_rate": 0.0009681658879253461, + "loss": 2.7219, + "step": 1228 + }, + { + "epoch": 0.14092420593968583, + "grad_norm": 0.3266776062791121, + "learning_rate": 0.0009681006549623548, + "loss": 2.7578, + "step": 1229 + }, + { + "epoch": 0.14103887168902648, + "grad_norm": 0.2761529781871215, + "learning_rate": 0.000968035357433801, + "loss": 2.6032, + "step": 1230 + }, + { + "epoch": 0.14115353743836717, + "grad_norm": 0.3175993078794819, + "learning_rate": 0.0009679699953486913, + "loss": 2.6612, + "step": 1231 + }, + { + "epoch": 0.14126820318770783, + "grad_norm": 0.2931444822651926, + "learning_rate": 0.0009679045687160411, + "loss": 2.615, + "step": 1232 + }, + { + "epoch": 0.14138286893704852, + "grad_norm": 0.290377160482274, + "learning_rate": 0.0009678390775448745, + "loss": 2.7659, + "step": 1233 + }, + { + "epoch": 0.14149753468638918, + "grad_norm": 0.286833552871121, + "learning_rate": 0.0009677735218442252, + "loss": 2.527, + "step": 1234 + }, + { + "epoch": 0.14161220043572983, + "grad_norm": 0.2841049027703459, + "learning_rate": 0.0009677079016231349, + "loss": 2.5961, + "step": 1235 + }, + { + "epoch": 0.14172686618507052, + "grad_norm": 0.32630203867825636, + "learning_rate": 0.000967642216890655, + "loss": 2.8204, + "step": 1236 + }, + { + "epoch": 0.14184153193441118, + "grad_norm": 0.3015898235692081, + "learning_rate": 0.0009675764676558454, + "loss": 2.8384, + "step": 1237 + }, + { + "epoch": 0.14195619768375187, + "grad_norm": 0.28198854681773355, + "learning_rate": 0.000967510653927775, + "loss": 2.7332, + "step": 1238 + }, + { + "epoch": 0.14207086343309253, + "grad_norm": 0.27416288745188, + "learning_rate": 0.0009674447757155213, + "loss": 2.5034, + "step": 1239 + }, + { + "epoch": 0.1421855291824332, + "grad_norm": 0.28576391477179913, + "learning_rate": 0.0009673788330281709, + "loss": 2.718, + "step": 1240 + }, + { + "epoch": 0.14230019493177387, + "grad_norm": 0.28171942651759196, + "learning_rate": 0.0009673128258748199, + "loss": 2.7182, + "step": 1241 + }, + { + "epoch": 0.14241486068111456, + "grad_norm": 0.29435416038799517, + "learning_rate": 0.0009672467542645722, + "loss": 2.4713, + "step": 1242 + }, + { + "epoch": 0.14252952643045522, + "grad_norm": 0.2983066143161253, + "learning_rate": 0.0009671806182065414, + "loss": 2.7271, + "step": 1243 + }, + { + "epoch": 0.1426441921797959, + "grad_norm": 0.30426493788129755, + "learning_rate": 0.0009671144177098494, + "loss": 2.724, + "step": 1244 + }, + { + "epoch": 0.14275885792913656, + "grad_norm": 0.3012439342830468, + "learning_rate": 0.0009670481527836276, + "loss": 2.6508, + "step": 1245 + }, + { + "epoch": 0.14287352367847725, + "grad_norm": 0.3172671313404544, + "learning_rate": 0.000966981823437016, + "loss": 2.5874, + "step": 1246 + }, + { + "epoch": 0.1429881894278179, + "grad_norm": 0.27924155029992187, + "learning_rate": 0.0009669154296791632, + "loss": 2.6983, + "step": 1247 + }, + { + "epoch": 0.1431028551771586, + "grad_norm": 0.3018720099163792, + "learning_rate": 0.000966848971519227, + "loss": 2.7453, + "step": 1248 + }, + { + "epoch": 0.14321752092649925, + "grad_norm": 0.32031332172534815, + "learning_rate": 0.0009667824489663743, + "loss": 2.6061, + "step": 1249 + }, + { + "epoch": 0.14333218667583994, + "grad_norm": 0.30010723061919725, + "learning_rate": 0.0009667158620297803, + "loss": 2.7217, + "step": 1250 + }, + { + "epoch": 0.1434468524251806, + "grad_norm": 0.28541072883213303, + "learning_rate": 0.0009666492107186296, + "loss": 2.7163, + "step": 1251 + }, + { + "epoch": 0.14356151817452126, + "grad_norm": 0.295062026049857, + "learning_rate": 0.0009665824950421155, + "loss": 2.6173, + "step": 1252 + }, + { + "epoch": 0.14367618392386194, + "grad_norm": 0.3299893458242848, + "learning_rate": 0.00096651571500944, + "loss": 2.5901, + "step": 1253 + }, + { + "epoch": 0.1437908496732026, + "grad_norm": 0.3164751250221209, + "learning_rate": 0.0009664488706298142, + "loss": 2.7301, + "step": 1254 + }, + { + "epoch": 0.1439055154225433, + "grad_norm": 0.2902843955436391, + "learning_rate": 0.0009663819619124581, + "loss": 2.6628, + "step": 1255 + }, + { + "epoch": 0.14402018117188395, + "grad_norm": 0.3276007670590875, + "learning_rate": 0.0009663149888666003, + "loss": 2.7709, + "step": 1256 + }, + { + "epoch": 0.14413484692122464, + "grad_norm": 0.31039979074424956, + "learning_rate": 0.0009662479515014786, + "loss": 2.7682, + "step": 1257 + }, + { + "epoch": 0.1442495126705653, + "grad_norm": 0.31655599629199527, + "learning_rate": 0.0009661808498263396, + "loss": 2.6182, + "step": 1258 + }, + { + "epoch": 0.14436417841990598, + "grad_norm": 0.3035882262475092, + "learning_rate": 0.0009661136838504385, + "loss": 2.6093, + "step": 1259 + }, + { + "epoch": 0.14447884416924664, + "grad_norm": 0.3108294032888355, + "learning_rate": 0.0009660464535830395, + "loss": 2.7525, + "step": 1260 + }, + { + "epoch": 0.14459350991858733, + "grad_norm": 0.31247055404509944, + "learning_rate": 0.0009659791590334162, + "loss": 2.7095, + "step": 1261 + }, + { + "epoch": 0.14470817566792799, + "grad_norm": 0.28522240273159905, + "learning_rate": 0.00096591180021085, + "loss": 2.5845, + "step": 1262 + }, + { + "epoch": 0.14482284141726867, + "grad_norm": 0.27594384242314346, + "learning_rate": 0.0009658443771246322, + "loss": 2.583, + "step": 1263 + }, + { + "epoch": 0.14493750716660933, + "grad_norm": 0.3034209555955427, + "learning_rate": 0.0009657768897840623, + "loss": 2.6136, + "step": 1264 + }, + { + "epoch": 0.14505217291595002, + "grad_norm": 0.313705184269361, + "learning_rate": 0.000965709338198449, + "loss": 2.6914, + "step": 1265 + }, + { + "epoch": 0.14516683866529068, + "grad_norm": 0.2885115473756491, + "learning_rate": 0.0009656417223771097, + "loss": 2.5412, + "step": 1266 + }, + { + "epoch": 0.14528150441463136, + "grad_norm": 0.2782035876080379, + "learning_rate": 0.0009655740423293708, + "loss": 2.7199, + "step": 1267 + }, + { + "epoch": 0.14539617016397202, + "grad_norm": 0.33137507415933554, + "learning_rate": 0.0009655062980645673, + "loss": 2.7794, + "step": 1268 + }, + { + "epoch": 0.14551083591331268, + "grad_norm": 0.2797069105889208, + "learning_rate": 0.0009654384895920434, + "loss": 2.6238, + "step": 1269 + }, + { + "epoch": 0.14562550166265337, + "grad_norm": 0.3364891201494451, + "learning_rate": 0.0009653706169211519, + "loss": 2.8306, + "step": 1270 + }, + { + "epoch": 0.14574016741199403, + "grad_norm": 0.29349595897046465, + "learning_rate": 0.0009653026800612545, + "loss": 2.6591, + "step": 1271 + }, + { + "epoch": 0.1458548331613347, + "grad_norm": 0.3172380440683874, + "learning_rate": 0.0009652346790217221, + "loss": 2.5983, + "step": 1272 + }, + { + "epoch": 0.14596949891067537, + "grad_norm": 0.2831238481203662, + "learning_rate": 0.0009651666138119337, + "loss": 2.7247, + "step": 1273 + }, + { + "epoch": 0.14608416466001606, + "grad_norm": 0.30525842976600814, + "learning_rate": 0.000965098484441278, + "loss": 2.6583, + "step": 1274 + }, + { + "epoch": 0.14619883040935672, + "grad_norm": 0.2872357741355747, + "learning_rate": 0.0009650302909191517, + "loss": 2.5642, + "step": 1275 + }, + { + "epoch": 0.1463134961586974, + "grad_norm": 0.2976571065101496, + "learning_rate": 0.0009649620332549613, + "loss": 2.7995, + "step": 1276 + }, + { + "epoch": 0.14642816190803806, + "grad_norm": 0.2978847657666802, + "learning_rate": 0.0009648937114581212, + "loss": 2.5875, + "step": 1277 + }, + { + "epoch": 0.14654282765737875, + "grad_norm": 0.30867524647660627, + "learning_rate": 0.0009648253255380554, + "loss": 2.59, + "step": 1278 + }, + { + "epoch": 0.1466574934067194, + "grad_norm": 0.2819728796524857, + "learning_rate": 0.0009647568755041963, + "loss": 2.6564, + "step": 1279 + }, + { + "epoch": 0.1467721591560601, + "grad_norm": 0.304021304897499, + "learning_rate": 0.0009646883613659851, + "loss": 2.6794, + "step": 1280 + }, + { + "epoch": 0.14688682490540075, + "grad_norm": 0.30170758502930806, + "learning_rate": 0.0009646197831328725, + "loss": 2.5982, + "step": 1281 + }, + { + "epoch": 0.14700149065474144, + "grad_norm": 0.31465503919290083, + "learning_rate": 0.0009645511408143171, + "loss": 2.6388, + "step": 1282 + }, + { + "epoch": 0.1471161564040821, + "grad_norm": 0.3360397066010665, + "learning_rate": 0.0009644824344197872, + "loss": 2.6191, + "step": 1283 + }, + { + "epoch": 0.14723082215342279, + "grad_norm": 0.28568195826015264, + "learning_rate": 0.0009644136639587591, + "loss": 2.6031, + "step": 1284 + }, + { + "epoch": 0.14734548790276344, + "grad_norm": 0.2940096806935903, + "learning_rate": 0.0009643448294407186, + "loss": 2.5574, + "step": 1285 + }, + { + "epoch": 0.1474601536521041, + "grad_norm": 0.3064987419736337, + "learning_rate": 0.0009642759308751601, + "loss": 2.7071, + "step": 1286 + }, + { + "epoch": 0.1475748194014448, + "grad_norm": 0.29945749626450296, + "learning_rate": 0.0009642069682715868, + "loss": 2.6899, + "step": 1287 + }, + { + "epoch": 0.14768948515078545, + "grad_norm": 0.3032113659671877, + "learning_rate": 0.0009641379416395109, + "loss": 2.6868, + "step": 1288 + }, + { + "epoch": 0.14780415090012614, + "grad_norm": 0.3125916681543053, + "learning_rate": 0.0009640688509884532, + "loss": 2.6707, + "step": 1289 + }, + { + "epoch": 0.1479188166494668, + "grad_norm": 0.2904844069409456, + "learning_rate": 0.0009639996963279435, + "loss": 2.6969, + "step": 1290 + }, + { + "epoch": 0.14803348239880748, + "grad_norm": 0.31012913672379294, + "learning_rate": 0.0009639304776675204, + "loss": 2.6674, + "step": 1291 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.2829229815200425, + "learning_rate": 0.0009638611950167311, + "loss": 2.713, + "step": 1292 + }, + { + "epoch": 0.14826281389748883, + "grad_norm": 0.29547750840697584, + "learning_rate": 0.000963791848385132, + "loss": 2.6275, + "step": 1293 + }, + { + "epoch": 0.14837747964682949, + "grad_norm": 0.3205000610626429, + "learning_rate": 0.000963722437782288, + "loss": 2.7156, + "step": 1294 + }, + { + "epoch": 0.14849214539617017, + "grad_norm": 0.29463614804399874, + "learning_rate": 0.0009636529632177732, + "loss": 2.5619, + "step": 1295 + }, + { + "epoch": 0.14860681114551083, + "grad_norm": 0.3009057778776583, + "learning_rate": 0.0009635834247011701, + "loss": 2.5491, + "step": 1296 + }, + { + "epoch": 0.14872147689485152, + "grad_norm": 0.3635785225152186, + "learning_rate": 0.0009635138222420703, + "loss": 2.6911, + "step": 1297 + }, + { + "epoch": 0.14883614264419218, + "grad_norm": 0.31413647923754373, + "learning_rate": 0.000963444155850074, + "loss": 2.8696, + "step": 1298 + }, + { + "epoch": 0.14895080839353286, + "grad_norm": 0.36929621451095446, + "learning_rate": 0.0009633744255347905, + "loss": 2.7248, + "step": 1299 + }, + { + "epoch": 0.14906547414287352, + "grad_norm": 0.30710100368153004, + "learning_rate": 0.0009633046313058378, + "loss": 2.7617, + "step": 1300 + }, + { + "epoch": 0.1491801398922142, + "grad_norm": 1.229014264809978, + "learning_rate": 0.0009632347731728424, + "loss": 2.6771, + "step": 1301 + }, + { + "epoch": 0.14929480564155487, + "grad_norm": 4.307041763377014, + "learning_rate": 0.0009631648511454402, + "loss": 3.1203, + "step": 1302 + }, + { + "epoch": 0.14940947139089553, + "grad_norm": 0.3430616362839399, + "learning_rate": 0.0009630948652332756, + "loss": 2.8483, + "step": 1303 + }, + { + "epoch": 0.1495241371402362, + "grad_norm": 0.3768162483383602, + "learning_rate": 0.0009630248154460017, + "loss": 2.7336, + "step": 1304 + }, + { + "epoch": 0.14963880288957687, + "grad_norm": 0.4354516313054484, + "learning_rate": 0.0009629547017932805, + "loss": 2.8401, + "step": 1305 + }, + { + "epoch": 0.14975346863891756, + "grad_norm": 0.260235487537665, + "learning_rate": 0.0009628845242847829, + "loss": 2.5915, + "step": 1306 + }, + { + "epoch": 0.14986813438825822, + "grad_norm": 0.2779391508838878, + "learning_rate": 0.0009628142829301886, + "loss": 2.618, + "step": 1307 + }, + { + "epoch": 0.1499828001375989, + "grad_norm": 0.29886891436959195, + "learning_rate": 0.000962743977739186, + "loss": 2.7232, + "step": 1308 + }, + { + "epoch": 0.15009746588693956, + "grad_norm": 0.27395636672820894, + "learning_rate": 0.0009626736087214724, + "loss": 2.6386, + "step": 1309 + }, + { + "epoch": 0.15021213163628025, + "grad_norm": 0.29509241572181727, + "learning_rate": 0.0009626031758867538, + "loss": 2.7764, + "step": 1310 + }, + { + "epoch": 0.1503267973856209, + "grad_norm": 0.2879338193801754, + "learning_rate": 0.0009625326792447451, + "loss": 2.6116, + "step": 1311 + }, + { + "epoch": 0.1504414631349616, + "grad_norm": 0.31216096401965693, + "learning_rate": 0.00096246211880517, + "loss": 2.8987, + "step": 1312 + }, + { + "epoch": 0.15055612888430225, + "grad_norm": 0.2971942694288847, + "learning_rate": 0.000962391494577761, + "loss": 2.6521, + "step": 1313 + }, + { + "epoch": 0.15067079463364294, + "grad_norm": 0.2812591210320164, + "learning_rate": 0.0009623208065722592, + "loss": 2.7514, + "step": 1314 + }, + { + "epoch": 0.1507854603829836, + "grad_norm": 0.2969354159323737, + "learning_rate": 0.0009622500547984147, + "loss": 2.6615, + "step": 1315 + }, + { + "epoch": 0.15090012613232429, + "grad_norm": 0.2885863454558515, + "learning_rate": 0.0009621792392659867, + "loss": 2.6979, + "step": 1316 + }, + { + "epoch": 0.15101479188166494, + "grad_norm": 0.31343537531672244, + "learning_rate": 0.0009621083599847424, + "loss": 2.6475, + "step": 1317 + }, + { + "epoch": 0.15112945763100563, + "grad_norm": 0.31024777573597384, + "learning_rate": 0.0009620374169644583, + "loss": 2.7216, + "step": 1318 + }, + { + "epoch": 0.1512441233803463, + "grad_norm": 0.2778321792975303, + "learning_rate": 0.0009619664102149201, + "loss": 2.6305, + "step": 1319 + }, + { + "epoch": 0.15135878912968695, + "grad_norm": 0.2801952025862511, + "learning_rate": 0.0009618953397459211, + "loss": 2.7079, + "step": 1320 + }, + { + "epoch": 0.15147345487902764, + "grad_norm": 0.2912326911171677, + "learning_rate": 0.0009618242055672648, + "loss": 2.6208, + "step": 1321 + }, + { + "epoch": 0.1515881206283683, + "grad_norm": 0.28669232514196613, + "learning_rate": 0.0009617530076887624, + "loss": 2.6975, + "step": 1322 + }, + { + "epoch": 0.15170278637770898, + "grad_norm": 0.29980820411157194, + "learning_rate": 0.0009616817461202345, + "loss": 2.7435, + "step": 1323 + }, + { + "epoch": 0.15181745212704964, + "grad_norm": 0.2804860123607428, + "learning_rate": 0.0009616104208715101, + "loss": 2.8216, + "step": 1324 + }, + { + "epoch": 0.15193211787639033, + "grad_norm": 0.30058461036615736, + "learning_rate": 0.0009615390319524272, + "loss": 2.691, + "step": 1325 + }, + { + "epoch": 0.15204678362573099, + "grad_norm": 0.28183975892337065, + "learning_rate": 0.0009614675793728327, + "loss": 2.63, + "step": 1326 + }, + { + "epoch": 0.15216144937507167, + "grad_norm": 0.30047992958025976, + "learning_rate": 0.0009613960631425818, + "loss": 2.7993, + "step": 1327 + }, + { + "epoch": 0.15227611512441233, + "grad_norm": 0.28242224257019927, + "learning_rate": 0.000961324483271539, + "loss": 2.7805, + "step": 1328 + }, + { + "epoch": 0.15239078087375302, + "grad_norm": 0.2743901657006034, + "learning_rate": 0.0009612528397695777, + "loss": 2.5872, + "step": 1329 + }, + { + "epoch": 0.15250544662309368, + "grad_norm": 0.324018616345287, + "learning_rate": 0.0009611811326465791, + "loss": 2.6436, + "step": 1330 + }, + { + "epoch": 0.15262011237243436, + "grad_norm": 0.3094039012471273, + "learning_rate": 0.0009611093619124344, + "loss": 2.7481, + "step": 1331 + }, + { + "epoch": 0.15273477812177502, + "grad_norm": 0.32839856892671293, + "learning_rate": 0.0009610375275770427, + "loss": 2.7862, + "step": 1332 + }, + { + "epoch": 0.1528494438711157, + "grad_norm": 0.3036908245968126, + "learning_rate": 0.0009609656296503121, + "loss": 2.6353, + "step": 1333 + }, + { + "epoch": 0.15296410962045637, + "grad_norm": 0.31130742252115845, + "learning_rate": 0.0009608936681421599, + "loss": 2.7538, + "step": 1334 + }, + { + "epoch": 0.15307877536979705, + "grad_norm": 0.31403300740018963, + "learning_rate": 0.0009608216430625114, + "loss": 2.7097, + "step": 1335 + }, + { + "epoch": 0.1531934411191377, + "grad_norm": 0.32126031604362265, + "learning_rate": 0.0009607495544213014, + "loss": 2.7097, + "step": 1336 + }, + { + "epoch": 0.15330810686847837, + "grad_norm": 0.29996201059743044, + "learning_rate": 0.000960677402228473, + "loss": 2.7446, + "step": 1337 + }, + { + "epoch": 0.15342277261781906, + "grad_norm": 0.31087601092189504, + "learning_rate": 0.0009606051864939785, + "loss": 2.5068, + "step": 1338 + }, + { + "epoch": 0.15353743836715972, + "grad_norm": 0.2911594052732881, + "learning_rate": 0.0009605329072277782, + "loss": 2.7118, + "step": 1339 + }, + { + "epoch": 0.1536521041165004, + "grad_norm": 0.32672051784529743, + "learning_rate": 0.000960460564439842, + "loss": 2.6055, + "step": 1340 + }, + { + "epoch": 0.15376676986584106, + "grad_norm": 0.30596590504584176, + "learning_rate": 0.0009603881581401482, + "loss": 2.7223, + "step": 1341 + }, + { + "epoch": 0.15388143561518175, + "grad_norm": 0.29495817863816126, + "learning_rate": 0.0009603156883386836, + "loss": 2.5937, + "step": 1342 + }, + { + "epoch": 0.1539961013645224, + "grad_norm": 0.2905922889396395, + "learning_rate": 0.0009602431550454442, + "loss": 2.6542, + "step": 1343 + }, + { + "epoch": 0.1541107671138631, + "grad_norm": 0.29273684537364775, + "learning_rate": 0.0009601705582704348, + "loss": 2.677, + "step": 1344 + }, + { + "epoch": 0.15422543286320375, + "grad_norm": 0.3206561134807994, + "learning_rate": 0.0009600978980236683, + "loss": 2.7658, + "step": 1345 + }, + { + "epoch": 0.15434009861254444, + "grad_norm": 0.307659871836176, + "learning_rate": 0.0009600251743151672, + "loss": 2.6923, + "step": 1346 + }, + { + "epoch": 0.1544547643618851, + "grad_norm": 0.28028015978216897, + "learning_rate": 0.0009599523871549621, + "loss": 2.5286, + "step": 1347 + }, + { + "epoch": 0.1545694301112258, + "grad_norm": 0.2900983177041651, + "learning_rate": 0.0009598795365530928, + "loss": 2.7117, + "step": 1348 + }, + { + "epoch": 0.15468409586056645, + "grad_norm": 0.3021671341423222, + "learning_rate": 0.0009598066225196074, + "loss": 2.6965, + "step": 1349 + }, + { + "epoch": 0.15479876160990713, + "grad_norm": 0.3308628975894653, + "learning_rate": 0.0009597336450645633, + "loss": 2.6723, + "step": 1350 + }, + { + "epoch": 0.1549134273592478, + "grad_norm": 0.3150775385796687, + "learning_rate": 0.000959660604198026, + "loss": 2.5556, + "step": 1351 + }, + { + "epoch": 0.15502809310858848, + "grad_norm": 0.3181786953365109, + "learning_rate": 0.0009595874999300703, + "loss": 2.7486, + "step": 1352 + }, + { + "epoch": 0.15514275885792914, + "grad_norm": 0.2863765853606049, + "learning_rate": 0.0009595143322707795, + "loss": 2.5968, + "step": 1353 + }, + { + "epoch": 0.1552574246072698, + "grad_norm": 0.301936312190553, + "learning_rate": 0.0009594411012302459, + "loss": 2.7436, + "step": 1354 + }, + { + "epoch": 0.15537209035661048, + "grad_norm": 0.2712427155058814, + "learning_rate": 0.0009593678068185701, + "loss": 2.521, + "step": 1355 + }, + { + "epoch": 0.15548675610595114, + "grad_norm": 0.2898303125180279, + "learning_rate": 0.0009592944490458614, + "loss": 2.4917, + "step": 1356 + }, + { + "epoch": 0.15560142185529183, + "grad_norm": 0.3009522721902166, + "learning_rate": 0.0009592210279222386, + "loss": 2.8095, + "step": 1357 + }, + { + "epoch": 0.15571608760463249, + "grad_norm": 0.31190759030470555, + "learning_rate": 0.0009591475434578286, + "loss": 2.6754, + "step": 1358 + }, + { + "epoch": 0.15583075335397317, + "grad_norm": 0.2878539381987615, + "learning_rate": 0.0009590739956627671, + "loss": 2.7391, + "step": 1359 + }, + { + "epoch": 0.15594541910331383, + "grad_norm": 0.2844123743554373, + "learning_rate": 0.0009590003845471987, + "loss": 2.6585, + "step": 1360 + }, + { + "epoch": 0.15606008485265452, + "grad_norm": 0.28110045179738025, + "learning_rate": 0.0009589267101212764, + "loss": 2.521, + "step": 1361 + }, + { + "epoch": 0.15617475060199518, + "grad_norm": 0.3319338094831352, + "learning_rate": 0.0009588529723951625, + "loss": 2.734, + "step": 1362 + }, + { + "epoch": 0.15628941635133586, + "grad_norm": 0.3420508568427328, + "learning_rate": 0.0009587791713790276, + "loss": 2.6491, + "step": 1363 + }, + { + "epoch": 0.15640408210067652, + "grad_norm": 0.28738905725529035, + "learning_rate": 0.0009587053070830512, + "loss": 2.7038, + "step": 1364 + }, + { + "epoch": 0.1565187478500172, + "grad_norm": 0.2967616139059871, + "learning_rate": 0.0009586313795174213, + "loss": 2.7399, + "step": 1365 + }, + { + "epoch": 0.15663341359935787, + "grad_norm": 0.28953785272990806, + "learning_rate": 0.0009585573886923349, + "loss": 2.6468, + "step": 1366 + }, + { + "epoch": 0.15674807934869855, + "grad_norm": 0.306724069847887, + "learning_rate": 0.0009584833346179977, + "loss": 2.643, + "step": 1367 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.2687579143711925, + "learning_rate": 0.000958409217304624, + "loss": 2.5599, + "step": 1368 + }, + { + "epoch": 0.1569774108473799, + "grad_norm": 0.29070431833719695, + "learning_rate": 0.0009583350367624366, + "loss": 2.6963, + "step": 1369 + }, + { + "epoch": 0.15709207659672056, + "grad_norm": 0.30330516645010697, + "learning_rate": 0.0009582607930016678, + "loss": 2.7916, + "step": 1370 + }, + { + "epoch": 0.15720674234606122, + "grad_norm": 0.2804283396916566, + "learning_rate": 0.0009581864860325577, + "loss": 2.7007, + "step": 1371 + }, + { + "epoch": 0.1573214080954019, + "grad_norm": 0.2831796065121677, + "learning_rate": 0.0009581121158653558, + "loss": 2.6291, + "step": 1372 + }, + { + "epoch": 0.15743607384474256, + "grad_norm": 0.3284241744824253, + "learning_rate": 0.0009580376825103199, + "loss": 2.6397, + "step": 1373 + }, + { + "epoch": 0.15755073959408325, + "grad_norm": 0.280660308281362, + "learning_rate": 0.0009579631859777167, + "loss": 2.5074, + "step": 1374 + }, + { + "epoch": 0.1576654053434239, + "grad_norm": 0.28882643510692957, + "learning_rate": 0.0009578886262778214, + "loss": 2.6505, + "step": 1375 + }, + { + "epoch": 0.1577800710927646, + "grad_norm": 0.2906423896698658, + "learning_rate": 0.0009578140034209185, + "loss": 2.657, + "step": 1376 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.26113238524409305, + "learning_rate": 0.0009577393174173004, + "loss": 2.5377, + "step": 1377 + }, + { + "epoch": 0.15800940259144594, + "grad_norm": 0.2861986238013753, + "learning_rate": 0.0009576645682772689, + "loss": 2.6617, + "step": 1378 + }, + { + "epoch": 0.1581240683407866, + "grad_norm": 0.30158948399379343, + "learning_rate": 0.0009575897560111339, + "loss": 2.6743, + "step": 1379 + }, + { + "epoch": 0.1582387340901273, + "grad_norm": 0.28893938629548654, + "learning_rate": 0.0009575148806292146, + "loss": 2.8124, + "step": 1380 + }, + { + "epoch": 0.15835339983946795, + "grad_norm": 0.2850451670356848, + "learning_rate": 0.0009574399421418388, + "loss": 2.6023, + "step": 1381 + }, + { + "epoch": 0.15846806558880863, + "grad_norm": 0.2812654520496973, + "learning_rate": 0.0009573649405593422, + "loss": 2.7464, + "step": 1382 + }, + { + "epoch": 0.1585827313381493, + "grad_norm": 0.29649873676726207, + "learning_rate": 0.0009572898758920704, + "loss": 2.6811, + "step": 1383 + }, + { + "epoch": 0.15869739708748998, + "grad_norm": 0.285323763393947, + "learning_rate": 0.000957214748150377, + "loss": 2.6763, + "step": 1384 + }, + { + "epoch": 0.15881206283683064, + "grad_norm": 0.3112699489214442, + "learning_rate": 0.0009571395573446242, + "loss": 2.8377, + "step": 1385 + }, + { + "epoch": 0.15892672858617132, + "grad_norm": 0.28910352731450245, + "learning_rate": 0.0009570643034851835, + "loss": 2.7889, + "step": 1386 + }, + { + "epoch": 0.15904139433551198, + "grad_norm": 0.299228362841977, + "learning_rate": 0.0009569889865824345, + "loss": 2.7422, + "step": 1387 + }, + { + "epoch": 0.15915606008485264, + "grad_norm": 0.32584046422645296, + "learning_rate": 0.0009569136066467659, + "loss": 2.6867, + "step": 1388 + }, + { + "epoch": 0.15927072583419333, + "grad_norm": 0.3101293248062392, + "learning_rate": 0.0009568381636885747, + "loss": 2.6469, + "step": 1389 + }, + { + "epoch": 0.159385391583534, + "grad_norm": 0.28817243833619927, + "learning_rate": 0.0009567626577182671, + "loss": 2.6394, + "step": 1390 + }, + { + "epoch": 0.15950005733287467, + "grad_norm": 0.27853070935012986, + "learning_rate": 0.0009566870887462573, + "loss": 2.6048, + "step": 1391 + }, + { + "epoch": 0.15961472308221533, + "grad_norm": 0.29278542080928277, + "learning_rate": 0.0009566114567829691, + "loss": 2.7467, + "step": 1392 + }, + { + "epoch": 0.15972938883155602, + "grad_norm": 0.28072724997901144, + "learning_rate": 0.0009565357618388342, + "loss": 2.6325, + "step": 1393 + }, + { + "epoch": 0.15984405458089668, + "grad_norm": 0.26508041750084166, + "learning_rate": 0.0009564600039242932, + "loss": 2.7655, + "step": 1394 + }, + { + "epoch": 0.15995872033023736, + "grad_norm": 0.30225388915393603, + "learning_rate": 0.0009563841830497957, + "loss": 2.6474, + "step": 1395 + }, + { + "epoch": 0.16007338607957802, + "grad_norm": 0.28257919598395925, + "learning_rate": 0.0009563082992257996, + "loss": 2.5329, + "step": 1396 + }, + { + "epoch": 0.1601880518289187, + "grad_norm": 0.28709148095143805, + "learning_rate": 0.0009562323524627716, + "loss": 2.7921, + "step": 1397 + }, + { + "epoch": 0.16030271757825937, + "grad_norm": 0.3124660074222078, + "learning_rate": 0.0009561563427711872, + "loss": 2.7546, + "step": 1398 + }, + { + "epoch": 0.16041738332760005, + "grad_norm": 0.322950461699168, + "learning_rate": 0.0009560802701615304, + "loss": 2.5556, + "step": 1399 + }, + { + "epoch": 0.1605320490769407, + "grad_norm": 0.33760268314043496, + "learning_rate": 0.0009560041346442941, + "loss": 2.683, + "step": 1400 + }, + { + "epoch": 0.1606467148262814, + "grad_norm": 0.2736771753392101, + "learning_rate": 0.0009559279362299796, + "loss": 2.7207, + "step": 1401 + }, + { + "epoch": 0.16076138057562206, + "grad_norm": 0.3110926171361155, + "learning_rate": 0.0009558516749290971, + "loss": 2.573, + "step": 1402 + }, + { + "epoch": 0.16087604632496275, + "grad_norm": 0.31202705252548457, + "learning_rate": 0.0009557753507521653, + "loss": 2.6152, + "step": 1403 + }, + { + "epoch": 0.1609907120743034, + "grad_norm": 0.27397834242002433, + "learning_rate": 0.0009556989637097118, + "loss": 2.6419, + "step": 1404 + }, + { + "epoch": 0.16110537782364406, + "grad_norm": 0.28812009789717413, + "learning_rate": 0.0009556225138122727, + "loss": 2.5732, + "step": 1405 + }, + { + "epoch": 0.16122004357298475, + "grad_norm": 0.28564671520490803, + "learning_rate": 0.0009555460010703927, + "loss": 2.7242, + "step": 1406 + }, + { + "epoch": 0.1613347093223254, + "grad_norm": 0.29068215405101083, + "learning_rate": 0.0009554694254946252, + "loss": 2.4719, + "step": 1407 + }, + { + "epoch": 0.1614493750716661, + "grad_norm": 0.2739241875294356, + "learning_rate": 0.0009553927870955327, + "loss": 2.8237, + "step": 1408 + }, + { + "epoch": 0.16156404082100675, + "grad_norm": 0.27769375912799493, + "learning_rate": 0.0009553160858836858, + "loss": 2.6226, + "step": 1409 + }, + { + "epoch": 0.16167870657034744, + "grad_norm": 0.3090350010972169, + "learning_rate": 0.000955239321869664, + "loss": 2.64, + "step": 1410 + }, + { + "epoch": 0.1617933723196881, + "grad_norm": 0.2840761451661036, + "learning_rate": 0.0009551624950640552, + "loss": 2.5964, + "step": 1411 + }, + { + "epoch": 0.1619080380690288, + "grad_norm": 0.2936312083950193, + "learning_rate": 0.0009550856054774566, + "loss": 2.7175, + "step": 1412 + }, + { + "epoch": 0.16202270381836945, + "grad_norm": 0.29241776290349897, + "learning_rate": 0.0009550086531204733, + "loss": 2.675, + "step": 1413 + }, + { + "epoch": 0.16213736956771013, + "grad_norm": 0.3143612356899675, + "learning_rate": 0.0009549316380037196, + "loss": 2.6333, + "step": 1414 + }, + { + "epoch": 0.1622520353170508, + "grad_norm": 0.30064990656542856, + "learning_rate": 0.0009548545601378183, + "loss": 2.6607, + "step": 1415 + }, + { + "epoch": 0.16236670106639148, + "grad_norm": 0.3207323618511196, + "learning_rate": 0.0009547774195334007, + "loss": 2.7214, + "step": 1416 + }, + { + "epoch": 0.16248136681573214, + "grad_norm": 0.26272137208356916, + "learning_rate": 0.0009547002162011069, + "loss": 2.6222, + "step": 1417 + }, + { + "epoch": 0.16259603256507282, + "grad_norm": 0.3204269605220272, + "learning_rate": 0.0009546229501515856, + "loss": 2.6687, + "step": 1418 + }, + { + "epoch": 0.16271069831441348, + "grad_norm": 0.31773048263040615, + "learning_rate": 0.0009545456213954944, + "loss": 2.719, + "step": 1419 + }, + { + "epoch": 0.16282536406375417, + "grad_norm": 0.309855424827149, + "learning_rate": 0.000954468229943499, + "loss": 2.6968, + "step": 1420 + }, + { + "epoch": 0.16294002981309483, + "grad_norm": 0.2945499865623526, + "learning_rate": 0.0009543907758062742, + "loss": 2.5579, + "step": 1421 + }, + { + "epoch": 0.16305469556243551, + "grad_norm": 0.314048331515698, + "learning_rate": 0.0009543132589945034, + "loss": 2.7855, + "step": 1422 + }, + { + "epoch": 0.16316936131177617, + "grad_norm": 0.2865659891387558, + "learning_rate": 0.0009542356795188786, + "loss": 2.6914, + "step": 1423 + }, + { + "epoch": 0.16328402706111683, + "grad_norm": 0.32765506143342005, + "learning_rate": 0.0009541580373901002, + "loss": 2.5989, + "step": 1424 + }, + { + "epoch": 0.16339869281045752, + "grad_norm": 0.3048617292278364, + "learning_rate": 0.0009540803326188777, + "loss": 2.5351, + "step": 1425 + }, + { + "epoch": 0.16351335855979818, + "grad_norm": 0.2837077774513346, + "learning_rate": 0.0009540025652159288, + "loss": 2.627, + "step": 1426 + }, + { + "epoch": 0.16362802430913886, + "grad_norm": 0.315699924940462, + "learning_rate": 0.0009539247351919802, + "loss": 2.6636, + "step": 1427 + }, + { + "epoch": 0.16374269005847952, + "grad_norm": 0.2765188970627581, + "learning_rate": 0.0009538468425577669, + "loss": 2.6547, + "step": 1428 + }, + { + "epoch": 0.1638573558078202, + "grad_norm": 0.2915747658650876, + "learning_rate": 0.0009537688873240327, + "loss": 2.5286, + "step": 1429 + }, + { + "epoch": 0.16397202155716087, + "grad_norm": 0.28059276189087545, + "learning_rate": 0.0009536908695015303, + "loss": 2.6726, + "step": 1430 + }, + { + "epoch": 0.16408668730650156, + "grad_norm": 0.31217899244383984, + "learning_rate": 0.0009536127891010205, + "loss": 2.6636, + "step": 1431 + }, + { + "epoch": 0.16420135305584221, + "grad_norm": 0.3194344789611659, + "learning_rate": 0.000953534646133273, + "loss": 2.8102, + "step": 1432 + }, + { + "epoch": 0.1643160188051829, + "grad_norm": 0.30233952114902113, + "learning_rate": 0.0009534564406090664, + "loss": 2.5798, + "step": 1433 + }, + { + "epoch": 0.16443068455452356, + "grad_norm": 0.3204618045039989, + "learning_rate": 0.0009533781725391872, + "loss": 2.5656, + "step": 1434 + }, + { + "epoch": 0.16454535030386425, + "grad_norm": 0.3066549137474608, + "learning_rate": 0.0009532998419344316, + "loss": 2.6853, + "step": 1435 + }, + { + "epoch": 0.1646600160532049, + "grad_norm": 0.2882620931541858, + "learning_rate": 0.0009532214488056032, + "loss": 2.5774, + "step": 1436 + }, + { + "epoch": 0.1647746818025456, + "grad_norm": 0.28806916693313656, + "learning_rate": 0.0009531429931635154, + "loss": 2.5575, + "step": 1437 + }, + { + "epoch": 0.16488934755188625, + "grad_norm": 0.28272016928534155, + "learning_rate": 0.0009530644750189892, + "loss": 2.6609, + "step": 1438 + }, + { + "epoch": 0.16500401330122694, + "grad_norm": 0.2811104601283848, + "learning_rate": 0.0009529858943828548, + "loss": 2.6326, + "step": 1439 + }, + { + "epoch": 0.1651186790505676, + "grad_norm": 0.2663009438921707, + "learning_rate": 0.0009529072512659512, + "loss": 2.6215, + "step": 1440 + }, + { + "epoch": 0.16523334479990825, + "grad_norm": 0.2820000380483963, + "learning_rate": 0.0009528285456791253, + "loss": 2.6721, + "step": 1441 + }, + { + "epoch": 0.16534801054924894, + "grad_norm": 0.297854268523418, + "learning_rate": 0.0009527497776332334, + "loss": 2.6001, + "step": 1442 + }, + { + "epoch": 0.1654626762985896, + "grad_norm": 0.32222427351588706, + "learning_rate": 0.0009526709471391397, + "loss": 2.6269, + "step": 1443 + }, + { + "epoch": 0.1655773420479303, + "grad_norm": 0.3102724064412689, + "learning_rate": 0.0009525920542077176, + "loss": 2.6809, + "step": 1444 + }, + { + "epoch": 0.16569200779727095, + "grad_norm": 0.3167401340580198, + "learning_rate": 0.0009525130988498489, + "loss": 2.7132, + "step": 1445 + }, + { + "epoch": 0.16580667354661163, + "grad_norm": 0.3008248309919062, + "learning_rate": 0.0009524340810764237, + "loss": 2.5752, + "step": 1446 + }, + { + "epoch": 0.1659213392959523, + "grad_norm": 0.2953920935204927, + "learning_rate": 0.0009523550008983413, + "loss": 2.6058, + "step": 1447 + }, + { + "epoch": 0.16603600504529298, + "grad_norm": 0.2958244286339564, + "learning_rate": 0.0009522758583265092, + "loss": 2.5752, + "step": 1448 + }, + { + "epoch": 0.16615067079463364, + "grad_norm": 0.2708737051793529, + "learning_rate": 0.0009521966533718436, + "loss": 2.588, + "step": 1449 + }, + { + "epoch": 0.16626533654397432, + "grad_norm": 0.28417046858287925, + "learning_rate": 0.0009521173860452695, + "loss": 2.626, + "step": 1450 + }, + { + "epoch": 0.16638000229331498, + "grad_norm": 0.2817084696561381, + "learning_rate": 0.0009520380563577198, + "loss": 2.7451, + "step": 1451 + }, + { + "epoch": 0.16649466804265567, + "grad_norm": 0.28974527558245117, + "learning_rate": 0.000951958664320137, + "loss": 2.61, + "step": 1452 + }, + { + "epoch": 0.16660933379199633, + "grad_norm": 0.3230066379957491, + "learning_rate": 0.0009518792099434717, + "loss": 2.6368, + "step": 1453 + }, + { + "epoch": 0.16672399954133701, + "grad_norm": 0.27824886635865875, + "learning_rate": 0.0009517996932386827, + "loss": 2.6792, + "step": 1454 + }, + { + "epoch": 0.16683866529067767, + "grad_norm": 0.32774565771485126, + "learning_rate": 0.0009517201142167385, + "loss": 2.6827, + "step": 1455 + }, + { + "epoch": 0.16695333104001836, + "grad_norm": 0.3088994123920793, + "learning_rate": 0.0009516404728886148, + "loss": 2.6637, + "step": 1456 + }, + { + "epoch": 0.16706799678935902, + "grad_norm": 0.29449102075669653, + "learning_rate": 0.000951560769265297, + "loss": 2.6162, + "step": 1457 + }, + { + "epoch": 0.16718266253869968, + "grad_norm": 0.3261853360388319, + "learning_rate": 0.0009514810033577786, + "loss": 2.5551, + "step": 1458 + }, + { + "epoch": 0.16729732828804036, + "grad_norm": 0.29787856921191114, + "learning_rate": 0.0009514011751770618, + "loss": 2.7473, + "step": 1459 + }, + { + "epoch": 0.16741199403738102, + "grad_norm": 0.2944436817765084, + "learning_rate": 0.0009513212847341573, + "loss": 2.6746, + "step": 1460 + }, + { + "epoch": 0.1675266597867217, + "grad_norm": 0.28829425971899686, + "learning_rate": 0.0009512413320400847, + "loss": 2.6536, + "step": 1461 + }, + { + "epoch": 0.16764132553606237, + "grad_norm": 0.3290881295071441, + "learning_rate": 0.0009511613171058717, + "loss": 2.6729, + "step": 1462 + }, + { + "epoch": 0.16775599128540306, + "grad_norm": 0.27966983808484835, + "learning_rate": 0.000951081239942555, + "loss": 2.804, + "step": 1463 + }, + { + "epoch": 0.16787065703474371, + "grad_norm": 0.2757834808424912, + "learning_rate": 0.0009510011005611796, + "loss": 2.5126, + "step": 1464 + }, + { + "epoch": 0.1679853227840844, + "grad_norm": 0.27139395208258155, + "learning_rate": 0.0009509208989727992, + "loss": 2.7308, + "step": 1465 + }, + { + "epoch": 0.16809998853342506, + "grad_norm": 0.2737602092824948, + "learning_rate": 0.000950840635188476, + "loss": 2.7915, + "step": 1466 + }, + { + "epoch": 0.16821465428276575, + "grad_norm": 0.28300459330822075, + "learning_rate": 0.0009507603092192812, + "loss": 2.53, + "step": 1467 + }, + { + "epoch": 0.1683293200321064, + "grad_norm": 0.28048004413677813, + "learning_rate": 0.000950679921076294, + "loss": 2.7427, + "step": 1468 + }, + { + "epoch": 0.1684439857814471, + "grad_norm": 0.28796626270159864, + "learning_rate": 0.0009505994707706023, + "loss": 2.4798, + "step": 1469 + }, + { + "epoch": 0.16855865153078775, + "grad_norm": 0.2602547706208409, + "learning_rate": 0.000950518958313303, + "loss": 2.6237, + "step": 1470 + }, + { + "epoch": 0.16867331728012844, + "grad_norm": 0.31238677493304695, + "learning_rate": 0.0009504383837155008, + "loss": 2.7236, + "step": 1471 + }, + { + "epoch": 0.1687879830294691, + "grad_norm": 0.2804170854815118, + "learning_rate": 0.0009503577469883098, + "loss": 2.592, + "step": 1472 + }, + { + "epoch": 0.16890264877880978, + "grad_norm": 0.30053387490909567, + "learning_rate": 0.0009502770481428525, + "loss": 2.6539, + "step": 1473 + }, + { + "epoch": 0.16901731452815044, + "grad_norm": 0.30834320472548965, + "learning_rate": 0.000950196287190259, + "loss": 2.756, + "step": 1474 + }, + { + "epoch": 0.1691319802774911, + "grad_norm": 0.31776791179375063, + "learning_rate": 0.0009501154641416695, + "loss": 2.6248, + "step": 1475 + }, + { + "epoch": 0.1692466460268318, + "grad_norm": 0.3087810351869066, + "learning_rate": 0.0009500345790082317, + "loss": 2.6473, + "step": 1476 + }, + { + "epoch": 0.16936131177617245, + "grad_norm": 0.30480823299445287, + "learning_rate": 0.0009499536318011019, + "loss": 2.7211, + "step": 1477 + }, + { + "epoch": 0.16947597752551313, + "grad_norm": 0.30880566619999633, + "learning_rate": 0.0009498726225314458, + "loss": 2.5683, + "step": 1478 + }, + { + "epoch": 0.1695906432748538, + "grad_norm": 0.33186228339465085, + "learning_rate": 0.0009497915512104367, + "loss": 2.5963, + "step": 1479 + }, + { + "epoch": 0.16970530902419448, + "grad_norm": 0.3111155102589005, + "learning_rate": 0.0009497104178492568, + "loss": 2.6648, + "step": 1480 + }, + { + "epoch": 0.16981997477353514, + "grad_norm": 0.28135526141656864, + "learning_rate": 0.0009496292224590973, + "loss": 2.7007, + "step": 1481 + }, + { + "epoch": 0.16993464052287582, + "grad_norm": 0.29578500090767396, + "learning_rate": 0.000949547965051157, + "loss": 2.6404, + "step": 1482 + }, + { + "epoch": 0.17004930627221648, + "grad_norm": 0.2827310943911389, + "learning_rate": 0.0009494666456366441, + "loss": 2.5735, + "step": 1483 + }, + { + "epoch": 0.17016397202155717, + "grad_norm": 0.3025309710319539, + "learning_rate": 0.0009493852642267751, + "loss": 2.6682, + "step": 1484 + }, + { + "epoch": 0.17027863777089783, + "grad_norm": 0.28255922243287473, + "learning_rate": 0.0009493038208327749, + "loss": 2.5944, + "step": 1485 + }, + { + "epoch": 0.17039330352023851, + "grad_norm": 0.33117238349940387, + "learning_rate": 0.0009492223154658773, + "loss": 2.6566, + "step": 1486 + }, + { + "epoch": 0.17050796926957917, + "grad_norm": 0.30506155447438676, + "learning_rate": 0.0009491407481373241, + "loss": 2.6299, + "step": 1487 + }, + { + "epoch": 0.17062263501891986, + "grad_norm": 0.28670429040430734, + "learning_rate": 0.0009490591188583661, + "loss": 2.5977, + "step": 1488 + }, + { + "epoch": 0.17073730076826052, + "grad_norm": 0.31573783930321386, + "learning_rate": 0.0009489774276402625, + "loss": 2.6092, + "step": 1489 + }, + { + "epoch": 0.1708519665176012, + "grad_norm": 0.33596673511835784, + "learning_rate": 0.0009488956744942811, + "loss": 2.558, + "step": 1490 + }, + { + "epoch": 0.17096663226694186, + "grad_norm": 0.3138896289374198, + "learning_rate": 0.0009488138594316982, + "loss": 2.7614, + "step": 1491 + }, + { + "epoch": 0.17108129801628252, + "grad_norm": 0.30605480146804603, + "learning_rate": 0.0009487319824637983, + "loss": 2.6414, + "step": 1492 + }, + { + "epoch": 0.1711959637656232, + "grad_norm": 0.2904203431533111, + "learning_rate": 0.0009486500436018752, + "loss": 2.7069, + "step": 1493 + }, + { + "epoch": 0.17131062951496387, + "grad_norm": 0.2895527923973493, + "learning_rate": 0.0009485680428572308, + "loss": 2.6998, + "step": 1494 + }, + { + "epoch": 0.17142529526430456, + "grad_norm": 0.30183643209093675, + "learning_rate": 0.0009484859802411751, + "loss": 2.5405, + "step": 1495 + }, + { + "epoch": 0.17153996101364521, + "grad_norm": 0.3009461864055049, + "learning_rate": 0.0009484038557650274, + "loss": 2.5786, + "step": 1496 + }, + { + "epoch": 0.1716546267629859, + "grad_norm": 0.3177453651091308, + "learning_rate": 0.0009483216694401152, + "loss": 2.7656, + "step": 1497 + }, + { + "epoch": 0.17176929251232656, + "grad_norm": 0.2837175084859612, + "learning_rate": 0.0009482394212777745, + "loss": 2.6968, + "step": 1498 + }, + { + "epoch": 0.17188395826166725, + "grad_norm": 0.26602938016721817, + "learning_rate": 0.0009481571112893498, + "loss": 2.7052, + "step": 1499 + }, + { + "epoch": 0.1719986240110079, + "grad_norm": 0.26495611851406126, + "learning_rate": 0.0009480747394861944, + "loss": 2.6593, + "step": 1500 + }, + { + "epoch": 0.1721132897603486, + "grad_norm": 0.2693634923702362, + "learning_rate": 0.0009479923058796695, + "loss": 2.483, + "step": 1501 + }, + { + "epoch": 0.17222795550968925, + "grad_norm": 0.25265713749306934, + "learning_rate": 0.0009479098104811459, + "loss": 2.5363, + "step": 1502 + }, + { + "epoch": 0.17234262125902994, + "grad_norm": 0.2975013468719012, + "learning_rate": 0.0009478272533020016, + "loss": 2.6521, + "step": 1503 + }, + { + "epoch": 0.1724572870083706, + "grad_norm": 0.2781856459368853, + "learning_rate": 0.0009477446343536241, + "loss": 2.7137, + "step": 1504 + }, + { + "epoch": 0.17257195275771128, + "grad_norm": 0.2953848626261515, + "learning_rate": 0.0009476619536474091, + "loss": 2.5883, + "step": 1505 + }, + { + "epoch": 0.17268661850705194, + "grad_norm": 0.28226725537786074, + "learning_rate": 0.0009475792111947607, + "loss": 2.6298, + "step": 1506 + }, + { + "epoch": 0.17280128425639263, + "grad_norm": 0.3099605978252708, + "learning_rate": 0.0009474964070070919, + "loss": 2.6173, + "step": 1507 + }, + { + "epoch": 0.1729159500057333, + "grad_norm": 0.3206886311737969, + "learning_rate": 0.0009474135410958239, + "loss": 2.7354, + "step": 1508 + }, + { + "epoch": 0.17303061575507395, + "grad_norm": 0.31795945458749986, + "learning_rate": 0.0009473306134723862, + "loss": 2.6441, + "step": 1509 + }, + { + "epoch": 0.17314528150441463, + "grad_norm": 0.33931547002360274, + "learning_rate": 0.0009472476241482173, + "loss": 2.6856, + "step": 1510 + }, + { + "epoch": 0.1732599472537553, + "grad_norm": 0.3131751106714846, + "learning_rate": 0.000947164573134764, + "loss": 2.5559, + "step": 1511 + }, + { + "epoch": 0.17337461300309598, + "grad_norm": 0.31278192336063265, + "learning_rate": 0.0009470814604434816, + "loss": 2.7362, + "step": 1512 + }, + { + "epoch": 0.17348927875243664, + "grad_norm": 0.30532758607176597, + "learning_rate": 0.000946998286085834, + "loss": 2.8198, + "step": 1513 + }, + { + "epoch": 0.17360394450177732, + "grad_norm": 0.30851306996712585, + "learning_rate": 0.0009469150500732932, + "loss": 2.5382, + "step": 1514 + }, + { + "epoch": 0.17371861025111798, + "grad_norm": 0.3259828180809456, + "learning_rate": 0.0009468317524173402, + "loss": 2.6779, + "step": 1515 + }, + { + "epoch": 0.17383327600045867, + "grad_norm": 0.2964282898313913, + "learning_rate": 0.0009467483931294644, + "loss": 2.6367, + "step": 1516 + }, + { + "epoch": 0.17394794174979933, + "grad_norm": 0.3015987617548308, + "learning_rate": 0.0009466649722211635, + "loss": 2.7786, + "step": 1517 + }, + { + "epoch": 0.17406260749914002, + "grad_norm": 0.30464791390570706, + "learning_rate": 0.000946581489703944, + "loss": 2.6539, + "step": 1518 + }, + { + "epoch": 0.17417727324848067, + "grad_norm": 0.2745374778803253, + "learning_rate": 0.0009464979455893205, + "loss": 2.536, + "step": 1519 + }, + { + "epoch": 0.17429193899782136, + "grad_norm": 0.27495875744278353, + "learning_rate": 0.0009464143398888166, + "loss": 2.5714, + "step": 1520 + }, + { + "epoch": 0.17440660474716202, + "grad_norm": 0.27522383973782755, + "learning_rate": 0.0009463306726139638, + "loss": 2.6366, + "step": 1521 + }, + { + "epoch": 0.1745212704965027, + "grad_norm": 0.29495152826234694, + "learning_rate": 0.0009462469437763026, + "loss": 2.7182, + "step": 1522 + }, + { + "epoch": 0.17463593624584337, + "grad_norm": 0.26541693929842214, + "learning_rate": 0.0009461631533873818, + "loss": 2.5448, + "step": 1523 + }, + { + "epoch": 0.17475060199518405, + "grad_norm": 0.2973656239417534, + "learning_rate": 0.0009460793014587585, + "loss": 2.5127, + "step": 1524 + }, + { + "epoch": 0.1748652677445247, + "grad_norm": 0.3030775767510761, + "learning_rate": 0.0009459953880019987, + "loss": 2.7687, + "step": 1525 + }, + { + "epoch": 0.17497993349386537, + "grad_norm": 0.2637909379493865, + "learning_rate": 0.0009459114130286766, + "loss": 2.703, + "step": 1526 + }, + { + "epoch": 0.17509459924320606, + "grad_norm": 0.2694524706811455, + "learning_rate": 0.0009458273765503749, + "loss": 2.6349, + "step": 1527 + }, + { + "epoch": 0.17520926499254671, + "grad_norm": 0.31073057737754145, + "learning_rate": 0.0009457432785786848, + "loss": 2.542, + "step": 1528 + }, + { + "epoch": 0.1753239307418874, + "grad_norm": 0.30077112974795583, + "learning_rate": 0.0009456591191252061, + "loss": 2.8178, + "step": 1529 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 0.28534000104901913, + "learning_rate": 0.0009455748982015468, + "loss": 2.5936, + "step": 1530 + }, + { + "epoch": 0.17555326224056875, + "grad_norm": 0.2689044008676204, + "learning_rate": 0.0009454906158193239, + "loss": 2.753, + "step": 1531 + }, + { + "epoch": 0.1756679279899094, + "grad_norm": 0.29820364657109166, + "learning_rate": 0.0009454062719901624, + "loss": 2.6721, + "step": 1532 + }, + { + "epoch": 0.1757825937392501, + "grad_norm": 0.28398320156414253, + "learning_rate": 0.0009453218667256958, + "loss": 2.5474, + "step": 1533 + }, + { + "epoch": 0.17589725948859075, + "grad_norm": 0.366927796191532, + "learning_rate": 0.0009452374000375664, + "loss": 2.6449, + "step": 1534 + }, + { + "epoch": 0.17601192523793144, + "grad_norm": 0.29014059901614264, + "learning_rate": 0.0009451528719374245, + "loss": 2.7485, + "step": 1535 + }, + { + "epoch": 0.1761265909872721, + "grad_norm": 0.3007168410628329, + "learning_rate": 0.0009450682824369294, + "loss": 2.6584, + "step": 1536 + }, + { + "epoch": 0.17624125673661278, + "grad_norm": 0.3009065673318265, + "learning_rate": 0.0009449836315477485, + "loss": 2.5932, + "step": 1537 + }, + { + "epoch": 0.17635592248595344, + "grad_norm": 0.29526266492601716, + "learning_rate": 0.0009448989192815578, + "loss": 2.5454, + "step": 1538 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.30602915845172635, + "learning_rate": 0.0009448141456500416, + "loss": 2.7817, + "step": 1539 + }, + { + "epoch": 0.1765852539846348, + "grad_norm": 0.3020506961474114, + "learning_rate": 0.0009447293106648931, + "loss": 2.6561, + "step": 1540 + }, + { + "epoch": 0.17669991973397547, + "grad_norm": 0.26838638679045507, + "learning_rate": 0.0009446444143378134, + "loss": 2.6111, + "step": 1541 + }, + { + "epoch": 0.17681458548331613, + "grad_norm": 0.29076077851293425, + "learning_rate": 0.0009445594566805126, + "loss": 2.5808, + "step": 1542 + }, + { + "epoch": 0.1769292512326568, + "grad_norm": 0.2717047439045014, + "learning_rate": 0.0009444744377047088, + "loss": 2.6596, + "step": 1543 + }, + { + "epoch": 0.17704391698199748, + "grad_norm": 0.3024489025390416, + "learning_rate": 0.0009443893574221286, + "loss": 2.6598, + "step": 1544 + }, + { + "epoch": 0.17715858273133814, + "grad_norm": 0.27790608523239063, + "learning_rate": 0.0009443042158445074, + "loss": 2.692, + "step": 1545 + }, + { + "epoch": 0.17727324848067882, + "grad_norm": 0.2970646188717731, + "learning_rate": 0.000944219012983589, + "loss": 2.5375, + "step": 1546 + }, + { + "epoch": 0.17738791423001948, + "grad_norm": 0.2929272448653852, + "learning_rate": 0.0009441337488511252, + "loss": 2.5052, + "step": 1547 + }, + { + "epoch": 0.17750257997936017, + "grad_norm": 0.29092786670066356, + "learning_rate": 0.0009440484234588766, + "loss": 2.601, + "step": 1548 + }, + { + "epoch": 0.17761724572870083, + "grad_norm": 0.3089062864779452, + "learning_rate": 0.0009439630368186125, + "loss": 2.6262, + "step": 1549 + }, + { + "epoch": 0.17773191147804152, + "grad_norm": 0.2951561950761542, + "learning_rate": 0.0009438775889421102, + "loss": 2.6455, + "step": 1550 + }, + { + "epoch": 0.17784657722738217, + "grad_norm": 0.2916727628444164, + "learning_rate": 0.0009437920798411554, + "loss": 2.6278, + "step": 1551 + }, + { + "epoch": 0.17796124297672286, + "grad_norm": 0.29213314274446406, + "learning_rate": 0.0009437065095275429, + "loss": 2.6298, + "step": 1552 + }, + { + "epoch": 0.17807590872606352, + "grad_norm": 0.2651489394467703, + "learning_rate": 0.0009436208780130751, + "loss": 2.6627, + "step": 1553 + }, + { + "epoch": 0.1781905744754042, + "grad_norm": 0.31725705792573494, + "learning_rate": 0.0009435351853095633, + "loss": 2.7131, + "step": 1554 + }, + { + "epoch": 0.17830524022474487, + "grad_norm": 0.3156123779527773, + "learning_rate": 0.0009434494314288273, + "loss": 2.7252, + "step": 1555 + }, + { + "epoch": 0.17841990597408555, + "grad_norm": 0.2677063535949339, + "learning_rate": 0.0009433636163826951, + "loss": 2.5976, + "step": 1556 + }, + { + "epoch": 0.1785345717234262, + "grad_norm": 0.3281015928728765, + "learning_rate": 0.0009432777401830033, + "loss": 2.6502, + "step": 1557 + }, + { + "epoch": 0.1786492374727669, + "grad_norm": 0.2933539569048112, + "learning_rate": 0.0009431918028415969, + "loss": 2.4632, + "step": 1558 + }, + { + "epoch": 0.17876390322210756, + "grad_norm": 0.2944806214615233, + "learning_rate": 0.0009431058043703293, + "loss": 2.6128, + "step": 1559 + }, + { + "epoch": 0.17887856897144822, + "grad_norm": 0.3023002824676232, + "learning_rate": 0.0009430197447810625, + "loss": 2.7158, + "step": 1560 + }, + { + "epoch": 0.1789932347207889, + "grad_norm": 0.3144447210743638, + "learning_rate": 0.0009429336240856662, + "loss": 2.719, + "step": 1561 + }, + { + "epoch": 0.17910790047012956, + "grad_norm": 0.2793896979780344, + "learning_rate": 0.00094284744229602, + "loss": 2.5493, + "step": 1562 + }, + { + "epoch": 0.17922256621947025, + "grad_norm": 0.28812998331110956, + "learning_rate": 0.0009427611994240104, + "loss": 2.6607, + "step": 1563 + }, + { + "epoch": 0.1793372319688109, + "grad_norm": 0.2503551038701681, + "learning_rate": 0.0009426748954815332, + "loss": 2.6102, + "step": 1564 + }, + { + "epoch": 0.1794518977181516, + "grad_norm": 0.2890967294557894, + "learning_rate": 0.0009425885304804922, + "loss": 2.7804, + "step": 1565 + }, + { + "epoch": 0.17956656346749225, + "grad_norm": 0.2702078058215934, + "learning_rate": 0.0009425021044328, + "loss": 2.5802, + "step": 1566 + }, + { + "epoch": 0.17968122921683294, + "grad_norm": 0.24761957442675828, + "learning_rate": 0.0009424156173503772, + "loss": 2.6283, + "step": 1567 + }, + { + "epoch": 0.1797958949661736, + "grad_norm": 0.26839976163231943, + "learning_rate": 0.0009423290692451534, + "loss": 2.6337, + "step": 1568 + }, + { + "epoch": 0.17991056071551428, + "grad_norm": 0.24897925792663997, + "learning_rate": 0.000942242460129066, + "loss": 2.6027, + "step": 1569 + }, + { + "epoch": 0.18002522646485494, + "grad_norm": 0.2976443813214073, + "learning_rate": 0.0009421557900140612, + "loss": 2.705, + "step": 1570 + }, + { + "epoch": 0.18013989221419563, + "grad_norm": 0.30623174491703814, + "learning_rate": 0.0009420690589120932, + "loss": 2.6631, + "step": 1571 + }, + { + "epoch": 0.1802545579635363, + "grad_norm": 0.29580499544813665, + "learning_rate": 0.0009419822668351255, + "loss": 2.7546, + "step": 1572 + }, + { + "epoch": 0.18036922371287697, + "grad_norm": 0.2910336264497159, + "learning_rate": 0.0009418954137951288, + "loss": 2.6307, + "step": 1573 + }, + { + "epoch": 0.18048388946221763, + "grad_norm": 0.3325015577864962, + "learning_rate": 0.000941808499804083, + "loss": 2.6504, + "step": 1574 + }, + { + "epoch": 0.18059855521155832, + "grad_norm": 0.32410874444441473, + "learning_rate": 0.0009417215248739764, + "loss": 2.5533, + "step": 1575 + }, + { + "epoch": 0.18071322096089898, + "grad_norm": 0.2823388513204848, + "learning_rate": 0.0009416344890168054, + "loss": 2.6224, + "step": 1576 + }, + { + "epoch": 0.18082788671023964, + "grad_norm": 0.3042401970435541, + "learning_rate": 0.000941547392244575, + "loss": 2.6287, + "step": 1577 + }, + { + "epoch": 0.18094255245958032, + "grad_norm": 0.28583859338866113, + "learning_rate": 0.0009414602345692984, + "loss": 2.5986, + "step": 1578 + }, + { + "epoch": 0.18105721820892098, + "grad_norm": 0.3270861775025795, + "learning_rate": 0.0009413730160029974, + "loss": 2.8622, + "step": 1579 + }, + { + "epoch": 0.18117188395826167, + "grad_norm": 0.2791183194210447, + "learning_rate": 0.0009412857365577023, + "loss": 2.7038, + "step": 1580 + }, + { + "epoch": 0.18128654970760233, + "grad_norm": 0.26728468189240806, + "learning_rate": 0.0009411983962454515, + "loss": 2.6402, + "step": 1581 + }, + { + "epoch": 0.18140121545694302, + "grad_norm": 0.2782027059948188, + "learning_rate": 0.0009411109950782919, + "loss": 2.6564, + "step": 1582 + }, + { + "epoch": 0.18151588120628367, + "grad_norm": 0.27536306521063675, + "learning_rate": 0.0009410235330682788, + "loss": 2.648, + "step": 1583 + }, + { + "epoch": 0.18163054695562436, + "grad_norm": 0.30140752326698506, + "learning_rate": 0.0009409360102274761, + "loss": 2.7696, + "step": 1584 + }, + { + "epoch": 0.18174521270496502, + "grad_norm": 0.28476965576567537, + "learning_rate": 0.0009408484265679558, + "loss": 2.641, + "step": 1585 + }, + { + "epoch": 0.1818598784543057, + "grad_norm": 0.2912154401832019, + "learning_rate": 0.0009407607821017983, + "loss": 2.5654, + "step": 1586 + }, + { + "epoch": 0.18197454420364637, + "grad_norm": 0.27064674786681925, + "learning_rate": 0.0009406730768410927, + "loss": 2.6001, + "step": 1587 + }, + { + "epoch": 0.18208920995298705, + "grad_norm": 0.3079389881158881, + "learning_rate": 0.0009405853107979361, + "loss": 2.5993, + "step": 1588 + }, + { + "epoch": 0.1822038757023277, + "grad_norm": 0.2744699039593194, + "learning_rate": 0.0009404974839844341, + "loss": 2.7245, + "step": 1589 + }, + { + "epoch": 0.1823185414516684, + "grad_norm": 0.28595867588557095, + "learning_rate": 0.0009404095964127008, + "loss": 2.609, + "step": 1590 + }, + { + "epoch": 0.18243320720100906, + "grad_norm": 0.3234937555525272, + "learning_rate": 0.0009403216480948589, + "loss": 2.6891, + "step": 1591 + }, + { + "epoch": 0.18254787295034974, + "grad_norm": 0.3045545903055496, + "learning_rate": 0.0009402336390430388, + "loss": 2.5994, + "step": 1592 + }, + { + "epoch": 0.1826625386996904, + "grad_norm": 0.2962280238786574, + "learning_rate": 0.0009401455692693798, + "loss": 2.6063, + "step": 1593 + }, + { + "epoch": 0.18277720444903106, + "grad_norm": 0.3396992559743812, + "learning_rate": 0.0009400574387860294, + "loss": 2.6353, + "step": 1594 + }, + { + "epoch": 0.18289187019837175, + "grad_norm": 0.30783701879618003, + "learning_rate": 0.0009399692476051436, + "loss": 2.5981, + "step": 1595 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.3159177302862465, + "learning_rate": 0.0009398809957388868, + "loss": 2.6042, + "step": 1596 + }, + { + "epoch": 0.1831212016970531, + "grad_norm": 0.27765425844391156, + "learning_rate": 0.0009397926831994314, + "loss": 2.5528, + "step": 1597 + }, + { + "epoch": 0.18323586744639375, + "grad_norm": 0.268450928572531, + "learning_rate": 0.0009397043099989587, + "loss": 2.6004, + "step": 1598 + }, + { + "epoch": 0.18335053319573444, + "grad_norm": 0.2897147883884085, + "learning_rate": 0.0009396158761496577, + "loss": 2.7143, + "step": 1599 + }, + { + "epoch": 0.1834651989450751, + "grad_norm": 0.2933284819181002, + "learning_rate": 0.0009395273816637267, + "loss": 2.7728, + "step": 1600 + }, + { + "epoch": 0.18357986469441578, + "grad_norm": 0.30544029460470745, + "learning_rate": 0.0009394388265533713, + "loss": 2.7528, + "step": 1601 + }, + { + "epoch": 0.18369453044375644, + "grad_norm": 0.29069796962646677, + "learning_rate": 0.0009393502108308064, + "loss": 2.7128, + "step": 1602 + }, + { + "epoch": 0.18380919619309713, + "grad_norm": 0.31037129829178545, + "learning_rate": 0.0009392615345082547, + "loss": 2.8378, + "step": 1603 + }, + { + "epoch": 0.1839238619424378, + "grad_norm": 0.27303452879426143, + "learning_rate": 0.0009391727975979474, + "loss": 2.7026, + "step": 1604 + }, + { + "epoch": 0.18403852769177848, + "grad_norm": 0.30009853898275407, + "learning_rate": 0.0009390840001121239, + "loss": 2.6208, + "step": 1605 + }, + { + "epoch": 0.18415319344111913, + "grad_norm": 0.29923071803536866, + "learning_rate": 0.0009389951420630325, + "loss": 2.6483, + "step": 1606 + }, + { + "epoch": 0.18426785919045982, + "grad_norm": 0.2740264424118773, + "learning_rate": 0.0009389062234629292, + "loss": 2.6782, + "step": 1607 + }, + { + "epoch": 0.18438252493980048, + "grad_norm": 0.287194991648891, + "learning_rate": 0.0009388172443240788, + "loss": 2.5848, + "step": 1608 + }, + { + "epoch": 0.18449719068914117, + "grad_norm": 0.30150078162675387, + "learning_rate": 0.0009387282046587539, + "loss": 2.6558, + "step": 1609 + }, + { + "epoch": 0.18461185643848183, + "grad_norm": 0.2951588308871814, + "learning_rate": 0.0009386391044792363, + "loss": 2.5973, + "step": 1610 + }, + { + "epoch": 0.18472652218782248, + "grad_norm": 0.23524440028545218, + "learning_rate": 0.0009385499437978153, + "loss": 2.5132, + "step": 1611 + }, + { + "epoch": 0.18484118793716317, + "grad_norm": 0.2739704447838688, + "learning_rate": 0.0009384607226267891, + "loss": 2.689, + "step": 1612 + }, + { + "epoch": 0.18495585368650383, + "grad_norm": 0.2793427414219293, + "learning_rate": 0.0009383714409784643, + "loss": 2.5308, + "step": 1613 + }, + { + "epoch": 0.18507051943584452, + "grad_norm": 0.30337601161979083, + "learning_rate": 0.000938282098865155, + "loss": 2.7024, + "step": 1614 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.29629368290679026, + "learning_rate": 0.0009381926962991847, + "loss": 2.6783, + "step": 1615 + }, + { + "epoch": 0.18529985093452586, + "grad_norm": 0.2801128803762986, + "learning_rate": 0.0009381032332928847, + "loss": 2.5971, + "step": 1616 + }, + { + "epoch": 0.18541451668386652, + "grad_norm": 0.3356344561293281, + "learning_rate": 0.0009380137098585946, + "loss": 2.6614, + "step": 1617 + }, + { + "epoch": 0.1855291824332072, + "grad_norm": 0.29673425679719284, + "learning_rate": 0.0009379241260086626, + "loss": 2.5374, + "step": 1618 + }, + { + "epoch": 0.18564384818254787, + "grad_norm": 0.2943190784069161, + "learning_rate": 0.0009378344817554449, + "loss": 2.8114, + "step": 1619 + }, + { + "epoch": 0.18575851393188855, + "grad_norm": 0.29149759905709227, + "learning_rate": 0.0009377447771113065, + "loss": 2.6624, + "step": 1620 + }, + { + "epoch": 0.1858731796812292, + "grad_norm": 0.29876199012293314, + "learning_rate": 0.0009376550120886203, + "loss": 2.5378, + "step": 1621 + }, + { + "epoch": 0.1859878454305699, + "grad_norm": 0.3116963337528239, + "learning_rate": 0.0009375651866997674, + "loss": 2.6813, + "step": 1622 + }, + { + "epoch": 0.18610251117991056, + "grad_norm": 0.2786910569530075, + "learning_rate": 0.0009374753009571379, + "loss": 2.6264, + "step": 1623 + }, + { + "epoch": 0.18621717692925124, + "grad_norm": 0.26753595420092785, + "learning_rate": 0.0009373853548731297, + "loss": 2.5009, + "step": 1624 + }, + { + "epoch": 0.1863318426785919, + "grad_norm": 0.27015646330514587, + "learning_rate": 0.000937295348460149, + "loss": 2.6444, + "step": 1625 + }, + { + "epoch": 0.1864465084279326, + "grad_norm": 0.2882331691823512, + "learning_rate": 0.0009372052817306106, + "loss": 2.6701, + "step": 1626 + }, + { + "epoch": 0.18656117417727325, + "grad_norm": 0.2868421747540592, + "learning_rate": 0.0009371151546969376, + "loss": 2.5549, + "step": 1627 + }, + { + "epoch": 0.1866758399266139, + "grad_norm": 0.29906532482779186, + "learning_rate": 0.0009370249673715611, + "loss": 2.4808, + "step": 1628 + }, + { + "epoch": 0.1867905056759546, + "grad_norm": 0.3101826433512816, + "learning_rate": 0.0009369347197669207, + "loss": 2.7789, + "step": 1629 + }, + { + "epoch": 0.18690517142529525, + "grad_norm": 0.28527036467296285, + "learning_rate": 0.0009368444118954646, + "loss": 2.5803, + "step": 1630 + }, + { + "epoch": 0.18701983717463594, + "grad_norm": 0.29620391976219546, + "learning_rate": 0.0009367540437696489, + "loss": 2.7285, + "step": 1631 + }, + { + "epoch": 0.1871345029239766, + "grad_norm": 0.2759342798402815, + "learning_rate": 0.0009366636154019381, + "loss": 2.5969, + "step": 1632 + }, + { + "epoch": 0.18724916867331728, + "grad_norm": 0.27664114923108574, + "learning_rate": 0.0009365731268048052, + "loss": 2.6709, + "step": 1633 + }, + { + "epoch": 0.18736383442265794, + "grad_norm": 0.2695354081349453, + "learning_rate": 0.0009364825779907311, + "loss": 2.6622, + "step": 1634 + }, + { + "epoch": 0.18747850017199863, + "grad_norm": 0.2865983259279852, + "learning_rate": 0.0009363919689722056, + "loss": 2.588, + "step": 1635 + }, + { + "epoch": 0.1875931659213393, + "grad_norm": 0.27651615557639503, + "learning_rate": 0.0009363012997617264, + "loss": 2.6959, + "step": 1636 + }, + { + "epoch": 0.18770783167067998, + "grad_norm": 0.31206797324515645, + "learning_rate": 0.0009362105703717994, + "loss": 2.626, + "step": 1637 + }, + { + "epoch": 0.18782249742002063, + "grad_norm": 0.2601260397485705, + "learning_rate": 0.0009361197808149393, + "loss": 2.583, + "step": 1638 + }, + { + "epoch": 0.18793716316936132, + "grad_norm": 0.2711339295227757, + "learning_rate": 0.0009360289311036688, + "loss": 2.6201, + "step": 1639 + }, + { + "epoch": 0.18805182891870198, + "grad_norm": 0.2888400178614798, + "learning_rate": 0.0009359380212505184, + "loss": 2.6415, + "step": 1640 + }, + { + "epoch": 0.18816649466804267, + "grad_norm": 0.30071446604341157, + "learning_rate": 0.0009358470512680278, + "loss": 2.7359, + "step": 1641 + }, + { + "epoch": 0.18828116041738333, + "grad_norm": 0.27292667279703003, + "learning_rate": 0.0009357560211687445, + "loss": 2.6477, + "step": 1642 + }, + { + "epoch": 0.188395826166724, + "grad_norm": 0.2805619716769284, + "learning_rate": 0.0009356649309652243, + "loss": 2.5284, + "step": 1643 + }, + { + "epoch": 0.18851049191606467, + "grad_norm": 0.2690250281567539, + "learning_rate": 0.0009355737806700315, + "loss": 2.693, + "step": 1644 + }, + { + "epoch": 0.18862515766540533, + "grad_norm": 0.31442287594243556, + "learning_rate": 0.0009354825702957383, + "loss": 2.745, + "step": 1645 + }, + { + "epoch": 0.18873982341474602, + "grad_norm": 0.26390035694890357, + "learning_rate": 0.0009353912998549259, + "loss": 2.4715, + "step": 1646 + }, + { + "epoch": 0.18885448916408668, + "grad_norm": 0.2904274656010759, + "learning_rate": 0.0009352999693601827, + "loss": 2.7475, + "step": 1647 + }, + { + "epoch": 0.18896915491342736, + "grad_norm": 0.2658738651093013, + "learning_rate": 0.0009352085788241064, + "loss": 2.5381, + "step": 1648 + }, + { + "epoch": 0.18908382066276802, + "grad_norm": 0.28600584822310604, + "learning_rate": 0.0009351171282593026, + "loss": 2.4896, + "step": 1649 + }, + { + "epoch": 0.1891984864121087, + "grad_norm": 0.2949739403293361, + "learning_rate": 0.0009350256176783847, + "loss": 2.7043, + "step": 1650 + }, + { + "epoch": 0.18931315216144937, + "grad_norm": 0.2601698808073328, + "learning_rate": 0.0009349340470939753, + "loss": 2.5595, + "step": 1651 + }, + { + "epoch": 0.18942781791079005, + "grad_norm": 0.2809716579863838, + "learning_rate": 0.0009348424165187049, + "loss": 2.8073, + "step": 1652 + }, + { + "epoch": 0.1895424836601307, + "grad_norm": 0.3070053668395669, + "learning_rate": 0.0009347507259652119, + "loss": 2.6033, + "step": 1653 + }, + { + "epoch": 0.1896571494094714, + "grad_norm": 0.3181813220673822, + "learning_rate": 0.0009346589754461433, + "loss": 2.6295, + "step": 1654 + }, + { + "epoch": 0.18977181515881206, + "grad_norm": 0.3051686791808755, + "learning_rate": 0.0009345671649741545, + "loss": 2.5578, + "step": 1655 + }, + { + "epoch": 0.18988648090815274, + "grad_norm": 0.294904401079894, + "learning_rate": 0.0009344752945619089, + "loss": 2.6184, + "step": 1656 + }, + { + "epoch": 0.1900011466574934, + "grad_norm": 0.2896281238108003, + "learning_rate": 0.0009343833642220781, + "loss": 2.6809, + "step": 1657 + }, + { + "epoch": 0.1901158124068341, + "grad_norm": 0.2853277831028136, + "learning_rate": 0.0009342913739673424, + "loss": 2.5593, + "step": 1658 + }, + { + "epoch": 0.19023047815617475, + "grad_norm": 0.28778394604390056, + "learning_rate": 0.00093419932381039, + "loss": 2.6681, + "step": 1659 + }, + { + "epoch": 0.19034514390551543, + "grad_norm": 0.2814948936986095, + "learning_rate": 0.0009341072137639175, + "loss": 2.5445, + "step": 1660 + }, + { + "epoch": 0.1904598096548561, + "grad_norm": 0.28794003786753697, + "learning_rate": 0.0009340150438406296, + "loss": 2.5699, + "step": 1661 + }, + { + "epoch": 0.19057447540419675, + "grad_norm": 0.264828278606104, + "learning_rate": 0.0009339228140532396, + "loss": 2.6842, + "step": 1662 + }, + { + "epoch": 0.19068914115353744, + "grad_norm": 0.26794141908158825, + "learning_rate": 0.0009338305244144687, + "loss": 2.566, + "step": 1663 + }, + { + "epoch": 0.1908038069028781, + "grad_norm": 0.2784198207148264, + "learning_rate": 0.0009337381749370463, + "loss": 2.6559, + "step": 1664 + }, + { + "epoch": 0.19091847265221878, + "grad_norm": 0.2700447329563735, + "learning_rate": 0.0009336457656337108, + "loss": 2.7994, + "step": 1665 + }, + { + "epoch": 0.19103313840155944, + "grad_norm": 0.24834314276895197, + "learning_rate": 0.0009335532965172079, + "loss": 2.6122, + "step": 1666 + }, + { + "epoch": 0.19114780415090013, + "grad_norm": 0.2831720400238699, + "learning_rate": 0.0009334607676002919, + "loss": 2.6909, + "step": 1667 + }, + { + "epoch": 0.1912624699002408, + "grad_norm": 0.28914973880105727, + "learning_rate": 0.0009333681788957256, + "loss": 2.6878, + "step": 1668 + }, + { + "epoch": 0.19137713564958148, + "grad_norm": 0.29304060720372416, + "learning_rate": 0.0009332755304162798, + "loss": 2.5646, + "step": 1669 + }, + { + "epoch": 0.19149180139892213, + "grad_norm": 0.25314858427775216, + "learning_rate": 0.0009331828221747335, + "loss": 2.5606, + "step": 1670 + }, + { + "epoch": 0.19160646714826282, + "grad_norm": 0.325792760080864, + "learning_rate": 0.0009330900541838741, + "loss": 2.7139, + "step": 1671 + }, + { + "epoch": 0.19172113289760348, + "grad_norm": 0.28787676976779847, + "learning_rate": 0.0009329972264564972, + "loss": 2.6344, + "step": 1672 + }, + { + "epoch": 0.19183579864694417, + "grad_norm": 0.27576276754689916, + "learning_rate": 0.0009329043390054066, + "loss": 2.5655, + "step": 1673 + }, + { + "epoch": 0.19195046439628483, + "grad_norm": 0.294794584295284, + "learning_rate": 0.0009328113918434142, + "loss": 2.6071, + "step": 1674 + }, + { + "epoch": 0.1920651301456255, + "grad_norm": 0.3004997233442451, + "learning_rate": 0.0009327183849833406, + "loss": 2.6939, + "step": 1675 + }, + { + "epoch": 0.19217979589496617, + "grad_norm": 0.2751277123682815, + "learning_rate": 0.0009326253184380141, + "loss": 2.6713, + "step": 1676 + }, + { + "epoch": 0.19229446164430686, + "grad_norm": 0.2867734252046316, + "learning_rate": 0.0009325321922202716, + "loss": 2.6944, + "step": 1677 + }, + { + "epoch": 0.19240912739364752, + "grad_norm": 0.27641770654652825, + "learning_rate": 0.0009324390063429578, + "loss": 2.5355, + "step": 1678 + }, + { + "epoch": 0.1925237931429882, + "grad_norm": 0.26458423972910705, + "learning_rate": 0.0009323457608189263, + "loss": 2.733, + "step": 1679 + }, + { + "epoch": 0.19263845889232886, + "grad_norm": 0.2732839981523438, + "learning_rate": 0.0009322524556610384, + "loss": 2.5572, + "step": 1680 + }, + { + "epoch": 0.19275312464166952, + "grad_norm": 0.29258284583202737, + "learning_rate": 0.0009321590908821635, + "loss": 2.6516, + "step": 1681 + }, + { + "epoch": 0.1928677903910102, + "grad_norm": 0.26813493127264015, + "learning_rate": 0.0009320656664951797, + "loss": 2.5803, + "step": 1682 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 0.30478739766811735, + "learning_rate": 0.0009319721825129734, + "loss": 2.5953, + "step": 1683 + }, + { + "epoch": 0.19309712188969155, + "grad_norm": 0.27023023879764657, + "learning_rate": 0.0009318786389484383, + "loss": 2.5282, + "step": 1684 + }, + { + "epoch": 0.1932117876390322, + "grad_norm": 0.2763402459070808, + "learning_rate": 0.0009317850358144778, + "loss": 2.4999, + "step": 1685 + }, + { + "epoch": 0.1933264533883729, + "grad_norm": 0.3157947959751468, + "learning_rate": 0.0009316913731240018, + "loss": 2.4914, + "step": 1686 + }, + { + "epoch": 0.19344111913771356, + "grad_norm": 0.3124817905240308, + "learning_rate": 0.0009315976508899298, + "loss": 2.6382, + "step": 1687 + }, + { + "epoch": 0.19355578488705424, + "grad_norm": 0.30712230488803877, + "learning_rate": 0.0009315038691251887, + "loss": 2.5854, + "step": 1688 + }, + { + "epoch": 0.1936704506363949, + "grad_norm": 0.2923878178154196, + "learning_rate": 0.0009314100278427143, + "loss": 2.5773, + "step": 1689 + }, + { + "epoch": 0.1937851163857356, + "grad_norm": 0.31218078215668, + "learning_rate": 0.0009313161270554498, + "loss": 2.7746, + "step": 1690 + }, + { + "epoch": 0.19389978213507625, + "grad_norm": 0.30630615245995235, + "learning_rate": 0.0009312221667763472, + "loss": 2.6889, + "step": 1691 + }, + { + "epoch": 0.19401444788441694, + "grad_norm": 0.2886778151996603, + "learning_rate": 0.0009311281470183667, + "loss": 2.6624, + "step": 1692 + }, + { + "epoch": 0.1941291136337576, + "grad_norm": 0.26233581421762375, + "learning_rate": 0.0009310340677944762, + "loss": 2.5113, + "step": 1693 + }, + { + "epoch": 0.19424377938309828, + "grad_norm": 0.2478620895071049, + "learning_rate": 0.0009309399291176524, + "loss": 2.5476, + "step": 1694 + }, + { + "epoch": 0.19435844513243894, + "grad_norm": 0.26775185563544085, + "learning_rate": 0.0009308457310008798, + "loss": 2.5786, + "step": 1695 + }, + { + "epoch": 0.19447311088177963, + "grad_norm": 0.26014739028287986, + "learning_rate": 0.0009307514734571514, + "loss": 2.5151, + "step": 1696 + }, + { + "epoch": 0.19458777663112028, + "grad_norm": 0.2526489044714267, + "learning_rate": 0.0009306571564994679, + "loss": 2.5586, + "step": 1697 + }, + { + "epoch": 0.19470244238046094, + "grad_norm": 0.26145180880160135, + "learning_rate": 0.000930562780140839, + "loss": 2.7168, + "step": 1698 + }, + { + "epoch": 0.19481710812980163, + "grad_norm": 0.27692814346591, + "learning_rate": 0.0009304683443942816, + "loss": 2.5575, + "step": 1699 + }, + { + "epoch": 0.1949317738791423, + "grad_norm": 0.29041701130036884, + "learning_rate": 0.0009303738492728216, + "loss": 2.7003, + "step": 1700 + }, + { + "epoch": 0.19504643962848298, + "grad_norm": 0.2992879801572418, + "learning_rate": 0.000930279294789493, + "loss": 2.6822, + "step": 1701 + }, + { + "epoch": 0.19516110537782363, + "grad_norm": 0.2862615538992636, + "learning_rate": 0.0009301846809573373, + "loss": 2.6188, + "step": 1702 + }, + { + "epoch": 0.19527577112716432, + "grad_norm": 0.2872393578188177, + "learning_rate": 0.000930090007789405, + "loss": 2.5707, + "step": 1703 + }, + { + "epoch": 0.19539043687650498, + "grad_norm": 0.2809325292410022, + "learning_rate": 0.0009299952752987544, + "loss": 2.6404, + "step": 1704 + }, + { + "epoch": 0.19550510262584567, + "grad_norm": 0.33948355793714385, + "learning_rate": 0.0009299004834984519, + "loss": 2.5891, + "step": 1705 + }, + { + "epoch": 0.19561976837518633, + "grad_norm": 0.2942578740634884, + "learning_rate": 0.0009298056324015724, + "loss": 2.6191, + "step": 1706 + }, + { + "epoch": 0.195734434124527, + "grad_norm": 0.2929279931546125, + "learning_rate": 0.0009297107220211988, + "loss": 2.6574, + "step": 1707 + }, + { + "epoch": 0.19584909987386767, + "grad_norm": 0.2685094388898099, + "learning_rate": 0.0009296157523704223, + "loss": 2.5703, + "step": 1708 + }, + { + "epoch": 0.19596376562320836, + "grad_norm": 0.2911102519333245, + "learning_rate": 0.0009295207234623418, + "loss": 2.6691, + "step": 1709 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.28928000922977426, + "learning_rate": 0.000929425635310065, + "loss": 2.5787, + "step": 1710 + }, + { + "epoch": 0.1961930971218897, + "grad_norm": 0.2919157130864013, + "learning_rate": 0.0009293304879267073, + "loss": 2.8416, + "step": 1711 + }, + { + "epoch": 0.19630776287123036, + "grad_norm": 0.2748486990497807, + "learning_rate": 0.0009292352813253926, + "loss": 2.6306, + "step": 1712 + }, + { + "epoch": 0.19642242862057105, + "grad_norm": 0.27467221064391567, + "learning_rate": 0.0009291400155192528, + "loss": 2.5247, + "step": 1713 + }, + { + "epoch": 0.1965370943699117, + "grad_norm": 0.28655267312567906, + "learning_rate": 0.0009290446905214281, + "loss": 2.5682, + "step": 1714 + }, + { + "epoch": 0.19665176011925237, + "grad_norm": 0.27351615052854433, + "learning_rate": 0.0009289493063450666, + "loss": 2.6313, + "step": 1715 + }, + { + "epoch": 0.19676642586859305, + "grad_norm": 0.29991900735188, + "learning_rate": 0.0009288538630033247, + "loss": 2.732, + "step": 1716 + }, + { + "epoch": 0.1968810916179337, + "grad_norm": 0.26410057446094976, + "learning_rate": 0.0009287583605093674, + "loss": 2.6481, + "step": 1717 + }, + { + "epoch": 0.1969957573672744, + "grad_norm": 0.2553533369999085, + "learning_rate": 0.000928662798876367, + "loss": 2.7471, + "step": 1718 + }, + { + "epoch": 0.19711042311661506, + "grad_norm": 0.24060694671389823, + "learning_rate": 0.0009285671781175045, + "loss": 2.5678, + "step": 1719 + }, + { + "epoch": 0.19722508886595574, + "grad_norm": 0.2580506621163582, + "learning_rate": 0.000928471498245969, + "loss": 2.6716, + "step": 1720 + }, + { + "epoch": 0.1973397546152964, + "grad_norm": 0.2616284312681021, + "learning_rate": 0.0009283757592749577, + "loss": 2.5215, + "step": 1721 + }, + { + "epoch": 0.1974544203646371, + "grad_norm": 0.25203577974257835, + "learning_rate": 0.0009282799612176762, + "loss": 2.7473, + "step": 1722 + }, + { + "epoch": 0.19756908611397775, + "grad_norm": 0.296751865731299, + "learning_rate": 0.0009281841040873376, + "loss": 2.7065, + "step": 1723 + }, + { + "epoch": 0.19768375186331844, + "grad_norm": 0.2671438646220266, + "learning_rate": 0.0009280881878971637, + "loss": 2.7288, + "step": 1724 + }, + { + "epoch": 0.1977984176126591, + "grad_norm": 0.27546778661458454, + "learning_rate": 0.0009279922126603846, + "loss": 2.7598, + "step": 1725 + }, + { + "epoch": 0.19791308336199978, + "grad_norm": 0.27969097669459164, + "learning_rate": 0.000927896178390238, + "loss": 2.5758, + "step": 1726 + }, + { + "epoch": 0.19802774911134044, + "grad_norm": 0.2665419165349484, + "learning_rate": 0.0009278000850999699, + "loss": 2.628, + "step": 1727 + }, + { + "epoch": 0.19814241486068113, + "grad_norm": 0.32865196209771536, + "learning_rate": 0.0009277039328028347, + "loss": 2.6072, + "step": 1728 + }, + { + "epoch": 0.19825708061002179, + "grad_norm": 0.27609935996698404, + "learning_rate": 0.0009276077215120949, + "loss": 2.6378, + "step": 1729 + }, + { + "epoch": 0.19837174635936247, + "grad_norm": 0.297424795884251, + "learning_rate": 0.0009275114512410208, + "loss": 2.5096, + "step": 1730 + }, + { + "epoch": 0.19848641210870313, + "grad_norm": 0.2795760894911456, + "learning_rate": 0.000927415122002891, + "loss": 2.7274, + "step": 1731 + }, + { + "epoch": 0.1986010778580438, + "grad_norm": 0.26309631574320286, + "learning_rate": 0.0009273187338109925, + "loss": 2.5948, + "step": 1732 + }, + { + "epoch": 0.19871574360738448, + "grad_norm": 0.2704415797972162, + "learning_rate": 0.0009272222866786201, + "loss": 2.6391, + "step": 1733 + }, + { + "epoch": 0.19883040935672514, + "grad_norm": 0.2824461482521748, + "learning_rate": 0.0009271257806190769, + "loss": 2.6665, + "step": 1734 + }, + { + "epoch": 0.19894507510606582, + "grad_norm": 0.3378929695763108, + "learning_rate": 0.0009270292156456738, + "loss": 2.6966, + "step": 1735 + }, + { + "epoch": 0.19905974085540648, + "grad_norm": 0.2968579268909668, + "learning_rate": 0.0009269325917717306, + "loss": 2.6312, + "step": 1736 + }, + { + "epoch": 0.19917440660474717, + "grad_norm": 0.2604705864710766, + "learning_rate": 0.0009268359090105743, + "loss": 2.583, + "step": 1737 + }, + { + "epoch": 0.19928907235408783, + "grad_norm": 0.27078611162929744, + "learning_rate": 0.0009267391673755405, + "loss": 2.6624, + "step": 1738 + }, + { + "epoch": 0.1994037381034285, + "grad_norm": 0.30106035857019103, + "learning_rate": 0.0009266423668799731, + "loss": 2.6475, + "step": 1739 + }, + { + "epoch": 0.19951840385276917, + "grad_norm": 0.2803063222175058, + "learning_rate": 0.0009265455075372237, + "loss": 2.6104, + "step": 1740 + }, + { + "epoch": 0.19963306960210986, + "grad_norm": 0.2606181624767818, + "learning_rate": 0.0009264485893606523, + "loss": 2.687, + "step": 1741 + }, + { + "epoch": 0.19974773535145052, + "grad_norm": 0.2709598332186418, + "learning_rate": 0.0009263516123636267, + "loss": 2.5639, + "step": 1742 + }, + { + "epoch": 0.1998624011007912, + "grad_norm": 0.28347200949823614, + "learning_rate": 0.0009262545765595232, + "loss": 2.5372, + "step": 1743 + }, + { + "epoch": 0.19997706685013186, + "grad_norm": 0.27881331102566836, + "learning_rate": 0.000926157481961726, + "loss": 2.6542, + "step": 1744 + }, + { + "epoch": 0.20009173259947255, + "grad_norm": 0.27845622741764925, + "learning_rate": 0.0009260603285836276, + "loss": 2.7555, + "step": 1745 + }, + { + "epoch": 0.2002063983488132, + "grad_norm": 0.26847961598207754, + "learning_rate": 0.0009259631164386282, + "loss": 2.4554, + "step": 1746 + }, + { + "epoch": 0.2003210640981539, + "grad_norm": 0.31853256499486454, + "learning_rate": 0.0009258658455401365, + "loss": 2.6157, + "step": 1747 + }, + { + "epoch": 0.20043572984749455, + "grad_norm": 0.29895738916929404, + "learning_rate": 0.0009257685159015692, + "loss": 2.5628, + "step": 1748 + }, + { + "epoch": 0.2005503955968352, + "grad_norm": 0.27092781192713605, + "learning_rate": 0.0009256711275363509, + "loss": 2.787, + "step": 1749 + }, + { + "epoch": 0.2006650613461759, + "grad_norm": 0.32893431152903013, + "learning_rate": 0.0009255736804579147, + "loss": 2.5425, + "step": 1750 + }, + { + "epoch": 0.20077972709551656, + "grad_norm": 0.29513628112715307, + "learning_rate": 0.0009254761746797013, + "loss": 2.504, + "step": 1751 + }, + { + "epoch": 0.20089439284485724, + "grad_norm": 0.2988411154823401, + "learning_rate": 0.0009253786102151602, + "loss": 2.615, + "step": 1752 + }, + { + "epoch": 0.2010090585941979, + "grad_norm": 0.273621199490888, + "learning_rate": 0.0009252809870777481, + "loss": 2.7242, + "step": 1753 + }, + { + "epoch": 0.2011237243435386, + "grad_norm": 0.26078744569512974, + "learning_rate": 0.0009251833052809304, + "loss": 2.6302, + "step": 1754 + }, + { + "epoch": 0.20123839009287925, + "grad_norm": 0.254503886359509, + "learning_rate": 0.0009250855648381805, + "loss": 2.4573, + "step": 1755 + }, + { + "epoch": 0.20135305584221994, + "grad_norm": 0.26325298479855763, + "learning_rate": 0.0009249877657629799, + "loss": 2.6683, + "step": 1756 + }, + { + "epoch": 0.2014677215915606, + "grad_norm": 0.27149883444439366, + "learning_rate": 0.0009248899080688178, + "loss": 2.625, + "step": 1757 + }, + { + "epoch": 0.20158238734090128, + "grad_norm": 0.27157652111558644, + "learning_rate": 0.0009247919917691923, + "loss": 2.5554, + "step": 1758 + }, + { + "epoch": 0.20169705309024194, + "grad_norm": 0.2823642430565864, + "learning_rate": 0.0009246940168776086, + "loss": 2.6599, + "step": 1759 + }, + { + "epoch": 0.20181171883958263, + "grad_norm": 0.27323956886186274, + "learning_rate": 0.0009245959834075807, + "loss": 2.5351, + "step": 1760 + }, + { + "epoch": 0.20192638458892329, + "grad_norm": 0.2905324787805063, + "learning_rate": 0.0009244978913726304, + "loss": 2.7296, + "step": 1761 + }, + { + "epoch": 0.20204105033826397, + "grad_norm": 0.31225837240536775, + "learning_rate": 0.0009243997407862878, + "loss": 2.6828, + "step": 1762 + }, + { + "epoch": 0.20215571608760463, + "grad_norm": 0.2764447319487757, + "learning_rate": 0.0009243015316620906, + "loss": 2.5578, + "step": 1763 + }, + { + "epoch": 0.20227038183694532, + "grad_norm": 0.2841933683801059, + "learning_rate": 0.0009242032640135852, + "loss": 2.6, + "step": 1764 + }, + { + "epoch": 0.20238504758628598, + "grad_norm": 0.29179152760505483, + "learning_rate": 0.0009241049378543254, + "loss": 2.6247, + "step": 1765 + }, + { + "epoch": 0.20249971333562664, + "grad_norm": 0.2972631674408136, + "learning_rate": 0.0009240065531978736, + "loss": 2.5923, + "step": 1766 + }, + { + "epoch": 0.20261437908496732, + "grad_norm": 0.286572269169564, + "learning_rate": 0.0009239081100578002, + "loss": 2.5166, + "step": 1767 + }, + { + "epoch": 0.20272904483430798, + "grad_norm": 0.26487465032329527, + "learning_rate": 0.0009238096084476832, + "loss": 2.59, + "step": 1768 + }, + { + "epoch": 0.20284371058364867, + "grad_norm": 0.27462479085638664, + "learning_rate": 0.0009237110483811096, + "loss": 2.5336, + "step": 1769 + }, + { + "epoch": 0.20295837633298933, + "grad_norm": 0.2859307727208129, + "learning_rate": 0.0009236124298716734, + "loss": 2.8086, + "step": 1770 + }, + { + "epoch": 0.20307304208233, + "grad_norm": 0.2634898217111837, + "learning_rate": 0.0009235137529329772, + "loss": 2.6404, + "step": 1771 + }, + { + "epoch": 0.20318770783167067, + "grad_norm": 0.31524014390003985, + "learning_rate": 0.0009234150175786318, + "loss": 2.5594, + "step": 1772 + }, + { + "epoch": 0.20330237358101136, + "grad_norm": 0.3037261914238762, + "learning_rate": 0.0009233162238222556, + "loss": 2.5686, + "step": 1773 + }, + { + "epoch": 0.20341703933035202, + "grad_norm": 0.30005378358971446, + "learning_rate": 0.0009232173716774757, + "loss": 2.6601, + "step": 1774 + }, + { + "epoch": 0.2035317050796927, + "grad_norm": 0.30318877481006773, + "learning_rate": 0.0009231184611579265, + "loss": 2.6332, + "step": 1775 + }, + { + "epoch": 0.20364637082903336, + "grad_norm": 0.3015831389339266, + "learning_rate": 0.000923019492277251, + "loss": 2.6967, + "step": 1776 + }, + { + "epoch": 0.20376103657837405, + "grad_norm": 0.31032689094531113, + "learning_rate": 0.0009229204650491001, + "loss": 2.6506, + "step": 1777 + }, + { + "epoch": 0.2038757023277147, + "grad_norm": 0.31984075907495463, + "learning_rate": 0.0009228213794871325, + "loss": 2.6961, + "step": 1778 + }, + { + "epoch": 0.2039903680770554, + "grad_norm": 0.2995015241326771, + "learning_rate": 0.0009227222356050154, + "loss": 2.7223, + "step": 1779 + }, + { + "epoch": 0.20410503382639605, + "grad_norm": 0.254120920521997, + "learning_rate": 0.0009226230334164236, + "loss": 2.5075, + "step": 1780 + }, + { + "epoch": 0.20421969957573674, + "grad_norm": 0.2523150697559599, + "learning_rate": 0.0009225237729350403, + "loss": 2.627, + "step": 1781 + }, + { + "epoch": 0.2043343653250774, + "grad_norm": 0.2924672952770439, + "learning_rate": 0.0009224244541745566, + "loss": 2.5827, + "step": 1782 + }, + { + "epoch": 0.20444903107441806, + "grad_norm": 0.2964451721053978, + "learning_rate": 0.0009223250771486717, + "loss": 2.7902, + "step": 1783 + }, + { + "epoch": 0.20456369682375874, + "grad_norm": 0.27512054957429405, + "learning_rate": 0.0009222256418710923, + "loss": 2.4599, + "step": 1784 + }, + { + "epoch": 0.2046783625730994, + "grad_norm": 0.2577890929804786, + "learning_rate": 0.0009221261483555343, + "loss": 2.5715, + "step": 1785 + }, + { + "epoch": 0.2047930283224401, + "grad_norm": 0.29742219880611753, + "learning_rate": 0.0009220265966157205, + "loss": 2.6554, + "step": 1786 + }, + { + "epoch": 0.20490769407178075, + "grad_norm": 0.29582982979580524, + "learning_rate": 0.0009219269866653823, + "loss": 2.6075, + "step": 1787 + }, + { + "epoch": 0.20502235982112144, + "grad_norm": 0.2800466961496367, + "learning_rate": 0.0009218273185182588, + "loss": 2.5613, + "step": 1788 + }, + { + "epoch": 0.2051370255704621, + "grad_norm": 0.2720602791658052, + "learning_rate": 0.0009217275921880976, + "loss": 2.7009, + "step": 1789 + }, + { + "epoch": 0.20525169131980278, + "grad_norm": 0.26467284612710645, + "learning_rate": 0.000921627807688654, + "loss": 2.6417, + "step": 1790 + }, + { + "epoch": 0.20536635706914344, + "grad_norm": 0.28029396203374163, + "learning_rate": 0.0009215279650336911, + "loss": 2.6438, + "step": 1791 + }, + { + "epoch": 0.20548102281848413, + "grad_norm": 0.2952715814083847, + "learning_rate": 0.0009214280642369806, + "loss": 2.5739, + "step": 1792 + }, + { + "epoch": 0.20559568856782479, + "grad_norm": 0.27466522659836473, + "learning_rate": 0.0009213281053123018, + "loss": 2.5252, + "step": 1793 + }, + { + "epoch": 0.20571035431716547, + "grad_norm": 0.27300317387113116, + "learning_rate": 0.000921228088273442, + "loss": 2.5609, + "step": 1794 + }, + { + "epoch": 0.20582502006650613, + "grad_norm": 0.28475773425882983, + "learning_rate": 0.0009211280131341968, + "loss": 2.5327, + "step": 1795 + }, + { + "epoch": 0.20593968581584682, + "grad_norm": 0.27800539838571175, + "learning_rate": 0.0009210278799083695, + "loss": 2.4658, + "step": 1796 + }, + { + "epoch": 0.20605435156518748, + "grad_norm": 0.27344159847746036, + "learning_rate": 0.000920927688609772, + "loss": 2.4672, + "step": 1797 + }, + { + "epoch": 0.20616901731452816, + "grad_norm": 0.2882640478534675, + "learning_rate": 0.0009208274392522231, + "loss": 2.5276, + "step": 1798 + }, + { + "epoch": 0.20628368306386882, + "grad_norm": 0.28039057332029593, + "learning_rate": 0.0009207271318495509, + "loss": 2.5387, + "step": 1799 + }, + { + "epoch": 0.20639834881320948, + "grad_norm": 0.30498936725921144, + "learning_rate": 0.0009206267664155906, + "loss": 2.6403, + "step": 1800 + }, + { + "epoch": 0.20651301456255017, + "grad_norm": 0.2697333578929762, + "learning_rate": 0.0009205263429641857, + "loss": 2.7215, + "step": 1801 + }, + { + "epoch": 0.20662768031189083, + "grad_norm": 0.31249331175330564, + "learning_rate": 0.0009204258615091879, + "loss": 2.6832, + "step": 1802 + }, + { + "epoch": 0.2067423460612315, + "grad_norm": 0.2799012570294706, + "learning_rate": 0.0009203253220644564, + "loss": 2.6696, + "step": 1803 + }, + { + "epoch": 0.20685701181057217, + "grad_norm": 0.25278090626053484, + "learning_rate": 0.0009202247246438589, + "loss": 2.6514, + "step": 1804 + }, + { + "epoch": 0.20697167755991286, + "grad_norm": 0.28633395901928993, + "learning_rate": 0.0009201240692612708, + "loss": 2.6969, + "step": 1805 + }, + { + "epoch": 0.20708634330925352, + "grad_norm": 0.2929582198937101, + "learning_rate": 0.0009200233559305758, + "loss": 2.5596, + "step": 1806 + }, + { + "epoch": 0.2072010090585942, + "grad_norm": 0.2663089590681011, + "learning_rate": 0.0009199225846656649, + "loss": 2.6004, + "step": 1807 + }, + { + "epoch": 0.20731567480793486, + "grad_norm": 0.2760320054589639, + "learning_rate": 0.0009198217554804382, + "loss": 2.6414, + "step": 1808 + }, + { + "epoch": 0.20743034055727555, + "grad_norm": 0.2710764396013921, + "learning_rate": 0.0009197208683888028, + "loss": 2.6011, + "step": 1809 + }, + { + "epoch": 0.2075450063066162, + "grad_norm": 0.2517043468636936, + "learning_rate": 0.0009196199234046741, + "loss": 2.5256, + "step": 1810 + }, + { + "epoch": 0.2076596720559569, + "grad_norm": 0.31308343473426087, + "learning_rate": 0.0009195189205419757, + "loss": 2.6821, + "step": 1811 + }, + { + "epoch": 0.20777433780529755, + "grad_norm": 0.27151190853916474, + "learning_rate": 0.000919417859814639, + "loss": 2.6142, + "step": 1812 + }, + { + "epoch": 0.20788900355463824, + "grad_norm": 0.2701020751745533, + "learning_rate": 0.0009193167412366034, + "loss": 2.6516, + "step": 1813 + }, + { + "epoch": 0.2080036693039789, + "grad_norm": 0.2663926364421637, + "learning_rate": 0.0009192155648218162, + "loss": 2.6874, + "step": 1814 + }, + { + "epoch": 0.2081183350533196, + "grad_norm": 0.2644533736432294, + "learning_rate": 0.0009191143305842329, + "loss": 2.5328, + "step": 1815 + }, + { + "epoch": 0.20823300080266025, + "grad_norm": 0.2905391475269113, + "learning_rate": 0.0009190130385378166, + "loss": 2.5705, + "step": 1816 + }, + { + "epoch": 0.2083476665520009, + "grad_norm": 0.27639315799691644, + "learning_rate": 0.0009189116886965388, + "loss": 2.554, + "step": 1817 + }, + { + "epoch": 0.2084623323013416, + "grad_norm": 0.2693653631919564, + "learning_rate": 0.0009188102810743788, + "loss": 2.6249, + "step": 1818 + }, + { + "epoch": 0.20857699805068225, + "grad_norm": 0.286969232047764, + "learning_rate": 0.0009187088156853236, + "loss": 2.5677, + "step": 1819 + }, + { + "epoch": 0.20869166380002294, + "grad_norm": 0.2601503526373931, + "learning_rate": 0.0009186072925433689, + "loss": 2.5072, + "step": 1820 + }, + { + "epoch": 0.2088063295493636, + "grad_norm": 0.2653654402079119, + "learning_rate": 0.0009185057116625172, + "loss": 2.5455, + "step": 1821 + }, + { + "epoch": 0.20892099529870428, + "grad_norm": 0.2668601065990496, + "learning_rate": 0.0009184040730567803, + "loss": 2.442, + "step": 1822 + }, + { + "epoch": 0.20903566104804494, + "grad_norm": 0.2998095783103949, + "learning_rate": 0.0009183023767401769, + "loss": 2.464, + "step": 1823 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.3069579799522593, + "learning_rate": 0.0009182006227267343, + "loss": 2.7362, + "step": 1824 + }, + { + "epoch": 0.2092649925467263, + "grad_norm": 0.3136578445647607, + "learning_rate": 0.0009180988110304873, + "loss": 2.5904, + "step": 1825 + }, + { + "epoch": 0.20937965829606697, + "grad_norm": 0.30135775504050333, + "learning_rate": 0.000917996941665479, + "loss": 2.6904, + "step": 1826 + }, + { + "epoch": 0.20949432404540763, + "grad_norm": 0.2857759439690108, + "learning_rate": 0.0009178950146457606, + "loss": 2.6182, + "step": 1827 + }, + { + "epoch": 0.20960898979474832, + "grad_norm": 0.30608177123710906, + "learning_rate": 0.0009177930299853903, + "loss": 2.7166, + "step": 1828 + }, + { + "epoch": 0.20972365554408898, + "grad_norm": 0.2606953416847776, + "learning_rate": 0.0009176909876984356, + "loss": 2.5399, + "step": 1829 + }, + { + "epoch": 0.20983832129342966, + "grad_norm": 0.27569299374427203, + "learning_rate": 0.0009175888877989712, + "loss": 2.7004, + "step": 1830 + }, + { + "epoch": 0.20995298704277032, + "grad_norm": 0.30149350990317325, + "learning_rate": 0.0009174867303010795, + "loss": 2.5774, + "step": 1831 + }, + { + "epoch": 0.210067652792111, + "grad_norm": 0.27749096626494185, + "learning_rate": 0.0009173845152188516, + "loss": 2.5713, + "step": 1832 + }, + { + "epoch": 0.21018231854145167, + "grad_norm": 0.3138293889063431, + "learning_rate": 0.0009172822425663855, + "loss": 2.5801, + "step": 1833 + }, + { + "epoch": 0.21029698429079233, + "grad_norm": 0.2817368297169945, + "learning_rate": 0.0009171799123577886, + "loss": 2.5918, + "step": 1834 + }, + { + "epoch": 0.210411650040133, + "grad_norm": 0.27357098773786476, + "learning_rate": 0.0009170775246071747, + "loss": 2.6969, + "step": 1835 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2775443497152393, + "learning_rate": 0.0009169750793286667, + "loss": 2.6856, + "step": 1836 + }, + { + "epoch": 0.21064098153881436, + "grad_norm": 0.3031248784741946, + "learning_rate": 0.0009168725765363946, + "loss": 2.5311, + "step": 1837 + }, + { + "epoch": 0.21075564728815502, + "grad_norm": 0.257232567679352, + "learning_rate": 0.0009167700162444969, + "loss": 2.5365, + "step": 1838 + }, + { + "epoch": 0.2108703130374957, + "grad_norm": 0.2816349325373924, + "learning_rate": 0.0009166673984671198, + "loss": 2.611, + "step": 1839 + }, + { + "epoch": 0.21098497878683636, + "grad_norm": 0.2874232139828025, + "learning_rate": 0.0009165647232184176, + "loss": 2.6066, + "step": 1840 + }, + { + "epoch": 0.21109964453617705, + "grad_norm": 0.26525663183590725, + "learning_rate": 0.0009164619905125522, + "loss": 2.6868, + "step": 1841 + }, + { + "epoch": 0.2112143102855177, + "grad_norm": 0.2951763320631187, + "learning_rate": 0.0009163592003636936, + "loss": 2.6218, + "step": 1842 + }, + { + "epoch": 0.2113289760348584, + "grad_norm": 0.2959412129676108, + "learning_rate": 0.00091625635278602, + "loss": 2.6998, + "step": 1843 + }, + { + "epoch": 0.21144364178419905, + "grad_norm": 0.2658419991670816, + "learning_rate": 0.000916153447793717, + "loss": 2.774, + "step": 1844 + }, + { + "epoch": 0.21155830753353974, + "grad_norm": 0.3067430475063693, + "learning_rate": 0.0009160504854009786, + "loss": 2.597, + "step": 1845 + }, + { + "epoch": 0.2116729732828804, + "grad_norm": 0.27750197150782224, + "learning_rate": 0.0009159474656220063, + "loss": 2.4906, + "step": 1846 + }, + { + "epoch": 0.2117876390322211, + "grad_norm": 0.3013271728904373, + "learning_rate": 0.0009158443884710097, + "loss": 2.5229, + "step": 1847 + }, + { + "epoch": 0.21190230478156175, + "grad_norm": 0.32563786547260687, + "learning_rate": 0.0009157412539622065, + "loss": 2.6388, + "step": 1848 + }, + { + "epoch": 0.21201697053090243, + "grad_norm": 0.2700954928393621, + "learning_rate": 0.0009156380621098221, + "loss": 2.6039, + "step": 1849 + }, + { + "epoch": 0.2121316362802431, + "grad_norm": 0.29486758303320676, + "learning_rate": 0.0009155348129280898, + "loss": 2.5722, + "step": 1850 + }, + { + "epoch": 0.21224630202958375, + "grad_norm": 0.318338021691611, + "learning_rate": 0.000915431506431251, + "loss": 2.487, + "step": 1851 + }, + { + "epoch": 0.21236096777892444, + "grad_norm": 0.28896449032637983, + "learning_rate": 0.0009153281426335547, + "loss": 2.459, + "step": 1852 + }, + { + "epoch": 0.2124756335282651, + "grad_norm": 0.2796068687283091, + "learning_rate": 0.0009152247215492577, + "loss": 2.5065, + "step": 1853 + }, + { + "epoch": 0.21259029927760578, + "grad_norm": 0.29210895661653224, + "learning_rate": 0.0009151212431926256, + "loss": 2.6085, + "step": 1854 + }, + { + "epoch": 0.21270496502694644, + "grad_norm": 0.259018230564378, + "learning_rate": 0.0009150177075779308, + "loss": 2.5859, + "step": 1855 + }, + { + "epoch": 0.21281963077628713, + "grad_norm": 0.27516160389330796, + "learning_rate": 0.0009149141147194542, + "loss": 2.6781, + "step": 1856 + }, + { + "epoch": 0.2129342965256278, + "grad_norm": 0.243866375529754, + "learning_rate": 0.0009148104646314844, + "loss": 2.6508, + "step": 1857 + }, + { + "epoch": 0.21304896227496847, + "grad_norm": 0.26921311194148806, + "learning_rate": 0.000914706757328318, + "loss": 2.6083, + "step": 1858 + }, + { + "epoch": 0.21316362802430913, + "grad_norm": 0.2675641924022829, + "learning_rate": 0.0009146029928242596, + "loss": 2.6045, + "step": 1859 + }, + { + "epoch": 0.21327829377364982, + "grad_norm": 0.31126391034303047, + "learning_rate": 0.0009144991711336214, + "loss": 2.5702, + "step": 1860 + }, + { + "epoch": 0.21339295952299048, + "grad_norm": 0.2965333399936351, + "learning_rate": 0.0009143952922707235, + "loss": 2.6294, + "step": 1861 + }, + { + "epoch": 0.21350762527233116, + "grad_norm": 0.2956018161867304, + "learning_rate": 0.0009142913562498942, + "loss": 2.5606, + "step": 1862 + }, + { + "epoch": 0.21362229102167182, + "grad_norm": 0.26298415222915367, + "learning_rate": 0.0009141873630854694, + "loss": 2.6358, + "step": 1863 + }, + { + "epoch": 0.2137369567710125, + "grad_norm": 0.2803733082860882, + "learning_rate": 0.0009140833127917929, + "loss": 2.6963, + "step": 1864 + }, + { + "epoch": 0.21385162252035317, + "grad_norm": 0.2544748663167623, + "learning_rate": 0.0009139792053832166, + "loss": 2.5873, + "step": 1865 + }, + { + "epoch": 0.21396628826969386, + "grad_norm": 0.3152046627388256, + "learning_rate": 0.0009138750408741001, + "loss": 2.5582, + "step": 1866 + }, + { + "epoch": 0.21408095401903451, + "grad_norm": 0.3177331732417034, + "learning_rate": 0.000913770819278811, + "loss": 2.6573, + "step": 1867 + }, + { + "epoch": 0.21419561976837517, + "grad_norm": 0.2711250188987675, + "learning_rate": 0.0009136665406117244, + "loss": 2.6932, + "step": 1868 + }, + { + "epoch": 0.21431028551771586, + "grad_norm": 0.2806631734513546, + "learning_rate": 0.0009135622048872238, + "loss": 2.606, + "step": 1869 + }, + { + "epoch": 0.21442495126705652, + "grad_norm": 0.2859856161879142, + "learning_rate": 0.0009134578121197002, + "loss": 2.4877, + "step": 1870 + }, + { + "epoch": 0.2145396170163972, + "grad_norm": 0.26958369351317923, + "learning_rate": 0.0009133533623235526, + "loss": 2.4897, + "step": 1871 + }, + { + "epoch": 0.21465428276573786, + "grad_norm": 0.27924198489268304, + "learning_rate": 0.000913248855513188, + "loss": 2.5948, + "step": 1872 + }, + { + "epoch": 0.21476894851507855, + "grad_norm": 0.28351441938396327, + "learning_rate": 0.0009131442917030211, + "loss": 2.7292, + "step": 1873 + }, + { + "epoch": 0.2148836142644192, + "grad_norm": 0.2693439813139644, + "learning_rate": 0.0009130396709074741, + "loss": 2.6453, + "step": 1874 + }, + { + "epoch": 0.2149982800137599, + "grad_norm": 0.2624815228106838, + "learning_rate": 0.0009129349931409781, + "loss": 2.6312, + "step": 1875 + }, + { + "epoch": 0.21511294576310055, + "grad_norm": 0.2648119913687565, + "learning_rate": 0.0009128302584179708, + "loss": 2.5599, + "step": 1876 + }, + { + "epoch": 0.21522761151244124, + "grad_norm": 0.27046439808232203, + "learning_rate": 0.0009127254667528988, + "loss": 2.6051, + "step": 1877 + }, + { + "epoch": 0.2153422772617819, + "grad_norm": 0.27876532866756304, + "learning_rate": 0.0009126206181602158, + "loss": 2.6333, + "step": 1878 + }, + { + "epoch": 0.2154569430111226, + "grad_norm": 0.23141626998005832, + "learning_rate": 0.0009125157126543838, + "loss": 2.5377, + "step": 1879 + }, + { + "epoch": 0.21557160876046325, + "grad_norm": 0.261271787985513, + "learning_rate": 0.0009124107502498725, + "loss": 2.6866, + "step": 1880 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 0.26860498998455956, + "learning_rate": 0.0009123057309611595, + "loss": 2.5248, + "step": 1881 + }, + { + "epoch": 0.2158009402591446, + "grad_norm": 0.27140691316419624, + "learning_rate": 0.0009122006548027302, + "loss": 2.5887, + "step": 1882 + }, + { + "epoch": 0.21591560600848528, + "grad_norm": 0.2590548852444448, + "learning_rate": 0.0009120955217890778, + "loss": 2.5286, + "step": 1883 + }, + { + "epoch": 0.21603027175782594, + "grad_norm": 0.2649540281508204, + "learning_rate": 0.0009119903319347034, + "loss": 2.6273, + "step": 1884 + }, + { + "epoch": 0.2161449375071666, + "grad_norm": 0.30221413184569834, + "learning_rate": 0.000911885085254116, + "loss": 2.6532, + "step": 1885 + }, + { + "epoch": 0.21625960325650728, + "grad_norm": 0.2659387280570465, + "learning_rate": 0.0009117797817618323, + "loss": 2.5716, + "step": 1886 + }, + { + "epoch": 0.21637426900584794, + "grad_norm": 0.2741848094877138, + "learning_rate": 0.000911674421472377, + "loss": 2.514, + "step": 1887 + }, + { + "epoch": 0.21648893475518863, + "grad_norm": 0.2944245592233634, + "learning_rate": 0.0009115690044002824, + "loss": 2.4557, + "step": 1888 + }, + { + "epoch": 0.2166036005045293, + "grad_norm": 0.2809225446752393, + "learning_rate": 0.0009114635305600889, + "loss": 2.593, + "step": 1889 + }, + { + "epoch": 0.21671826625386997, + "grad_norm": 0.3094642148936137, + "learning_rate": 0.0009113579999663447, + "loss": 2.6578, + "step": 1890 + }, + { + "epoch": 0.21683293200321063, + "grad_norm": 0.32408141046131017, + "learning_rate": 0.0009112524126336054, + "loss": 2.6325, + "step": 1891 + }, + { + "epoch": 0.21694759775255132, + "grad_norm": 0.2915661617567413, + "learning_rate": 0.0009111467685764351, + "loss": 2.576, + "step": 1892 + }, + { + "epoch": 0.21706226350189198, + "grad_norm": 0.328285860938571, + "learning_rate": 0.0009110410678094051, + "loss": 2.5832, + "step": 1893 + }, + { + "epoch": 0.21717692925123266, + "grad_norm": 0.27907049962318375, + "learning_rate": 0.0009109353103470951, + "loss": 2.6467, + "step": 1894 + }, + { + "epoch": 0.21729159500057332, + "grad_norm": 0.26090877350144465, + "learning_rate": 0.0009108294962040921, + "loss": 2.5787, + "step": 1895 + }, + { + "epoch": 0.217406260749914, + "grad_norm": 0.28842180627322295, + "learning_rate": 0.0009107236253949912, + "loss": 2.7486, + "step": 1896 + }, + { + "epoch": 0.21752092649925467, + "grad_norm": 0.26362965143399214, + "learning_rate": 0.0009106176979343955, + "loss": 2.5946, + "step": 1897 + }, + { + "epoch": 0.21763559224859536, + "grad_norm": 0.2842749582815509, + "learning_rate": 0.0009105117138369151, + "loss": 2.4597, + "step": 1898 + }, + { + "epoch": 0.21775025799793601, + "grad_norm": 0.28080370326315857, + "learning_rate": 0.0009104056731171691, + "loss": 2.5793, + "step": 1899 + }, + { + "epoch": 0.2178649237472767, + "grad_norm": 0.24541024040210418, + "learning_rate": 0.0009102995757897834, + "loss": 2.7216, + "step": 1900 + }, + { + "epoch": 0.21797958949661736, + "grad_norm": 0.2670657057656899, + "learning_rate": 0.0009101934218693923, + "loss": 2.6512, + "step": 1901 + }, + { + "epoch": 0.21809425524595802, + "grad_norm": 0.26221270955037446, + "learning_rate": 0.0009100872113706375, + "loss": 2.6687, + "step": 1902 + }, + { + "epoch": 0.2182089209952987, + "grad_norm": 0.3145136942609155, + "learning_rate": 0.0009099809443081691, + "loss": 2.6042, + "step": 1903 + }, + { + "epoch": 0.21832358674463936, + "grad_norm": 0.26999972222507934, + "learning_rate": 0.0009098746206966443, + "loss": 2.6342, + "step": 1904 + }, + { + "epoch": 0.21843825249398005, + "grad_norm": 0.26445434448717314, + "learning_rate": 0.0009097682405507285, + "loss": 2.5026, + "step": 1905 + }, + { + "epoch": 0.2185529182433207, + "grad_norm": 0.26012609764076383, + "learning_rate": 0.0009096618038850948, + "loss": 2.6303, + "step": 1906 + }, + { + "epoch": 0.2186675839926614, + "grad_norm": 0.2692690711463572, + "learning_rate": 0.0009095553107144241, + "loss": 2.6002, + "step": 1907 + }, + { + "epoch": 0.21878224974200206, + "grad_norm": 0.2987751544110473, + "learning_rate": 0.0009094487610534052, + "loss": 2.4888, + "step": 1908 + }, + { + "epoch": 0.21889691549134274, + "grad_norm": 0.29662090412846376, + "learning_rate": 0.0009093421549167343, + "loss": 2.6612, + "step": 1909 + }, + { + "epoch": 0.2190115812406834, + "grad_norm": 0.28057920473199416, + "learning_rate": 0.0009092354923191161, + "loss": 2.7068, + "step": 1910 + }, + { + "epoch": 0.2191262469900241, + "grad_norm": 0.2874779490340694, + "learning_rate": 0.0009091287732752624, + "loss": 2.6985, + "step": 1911 + }, + { + "epoch": 0.21924091273936475, + "grad_norm": 0.2828317138805602, + "learning_rate": 0.0009090219977998933, + "loss": 2.5827, + "step": 1912 + }, + { + "epoch": 0.21935557848870543, + "grad_norm": 0.2745614086973994, + "learning_rate": 0.000908915165907736, + "loss": 2.4808, + "step": 1913 + }, + { + "epoch": 0.2194702442380461, + "grad_norm": 0.30613717156947323, + "learning_rate": 0.0009088082776135263, + "loss": 2.5825, + "step": 1914 + }, + { + "epoch": 0.21958490998738678, + "grad_norm": 0.2655508704288266, + "learning_rate": 0.0009087013329320073, + "loss": 2.6332, + "step": 1915 + }, + { + "epoch": 0.21969957573672744, + "grad_norm": 0.2554114466284559, + "learning_rate": 0.0009085943318779301, + "loss": 2.4686, + "step": 1916 + }, + { + "epoch": 0.21981424148606812, + "grad_norm": 0.2500802711103224, + "learning_rate": 0.0009084872744660532, + "loss": 2.4513, + "step": 1917 + }, + { + "epoch": 0.21992890723540878, + "grad_norm": 0.2523476571925537, + "learning_rate": 0.0009083801607111433, + "loss": 2.7623, + "step": 1918 + }, + { + "epoch": 0.22004357298474944, + "grad_norm": 0.27173208928970244, + "learning_rate": 0.0009082729906279746, + "loss": 2.519, + "step": 1919 + }, + { + "epoch": 0.22015823873409013, + "grad_norm": 0.27581178193610545, + "learning_rate": 0.0009081657642313292, + "loss": 2.5304, + "step": 1920 + }, + { + "epoch": 0.2202729044834308, + "grad_norm": 0.27120255021361356, + "learning_rate": 0.0009080584815359972, + "loss": 2.5753, + "step": 1921 + }, + { + "epoch": 0.22038757023277147, + "grad_norm": 0.250998699557209, + "learning_rate": 0.0009079511425567759, + "loss": 2.4572, + "step": 1922 + }, + { + "epoch": 0.22050223598211213, + "grad_norm": 0.2550318361099552, + "learning_rate": 0.0009078437473084706, + "loss": 2.6597, + "step": 1923 + }, + { + "epoch": 0.22061690173145282, + "grad_norm": 0.2590059578550897, + "learning_rate": 0.0009077362958058946, + "loss": 2.4548, + "step": 1924 + }, + { + "epoch": 0.22073156748079348, + "grad_norm": 0.2859852170905992, + "learning_rate": 0.0009076287880638689, + "loss": 2.7067, + "step": 1925 + }, + { + "epoch": 0.22084623323013416, + "grad_norm": 0.3119738888828401, + "learning_rate": 0.0009075212240972218, + "loss": 2.6942, + "step": 1926 + }, + { + "epoch": 0.22096089897947482, + "grad_norm": 0.289801152792331, + "learning_rate": 0.00090741360392079, + "loss": 2.4983, + "step": 1927 + }, + { + "epoch": 0.2210755647288155, + "grad_norm": 0.29610008529516996, + "learning_rate": 0.0009073059275494176, + "loss": 2.6153, + "step": 1928 + }, + { + "epoch": 0.22119023047815617, + "grad_norm": 0.26575778645368764, + "learning_rate": 0.0009071981949979564, + "loss": 2.6743, + "step": 1929 + }, + { + "epoch": 0.22130489622749686, + "grad_norm": 0.2894257444236656, + "learning_rate": 0.000907090406281266, + "loss": 2.6497, + "step": 1930 + }, + { + "epoch": 0.22141956197683751, + "grad_norm": 0.26585047703634673, + "learning_rate": 0.000906982561414214, + "loss": 2.6185, + "step": 1931 + }, + { + "epoch": 0.2215342277261782, + "grad_norm": 0.2754301975819511, + "learning_rate": 0.0009068746604116755, + "loss": 2.6864, + "step": 1932 + }, + { + "epoch": 0.22164889347551886, + "grad_norm": 0.27091326531361276, + "learning_rate": 0.0009067667032885334, + "loss": 2.5364, + "step": 1933 + }, + { + "epoch": 0.22176355922485955, + "grad_norm": 0.2604060571542392, + "learning_rate": 0.0009066586900596781, + "loss": 2.6768, + "step": 1934 + }, + { + "epoch": 0.2218782249742002, + "grad_norm": 0.27837045011853345, + "learning_rate": 0.000906550620740008, + "loss": 2.5315, + "step": 1935 + }, + { + "epoch": 0.2219928907235409, + "grad_norm": 0.2668698702701924, + "learning_rate": 0.0009064424953444296, + "loss": 2.5998, + "step": 1936 + }, + { + "epoch": 0.22210755647288155, + "grad_norm": 0.260843606153731, + "learning_rate": 0.0009063343138878563, + "loss": 2.4059, + "step": 1937 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.27938493112594875, + "learning_rate": 0.0009062260763852099, + "loss": 2.6744, + "step": 1938 + }, + { + "epoch": 0.2223368879715629, + "grad_norm": 0.3006081028560282, + "learning_rate": 0.0009061177828514198, + "loss": 2.593, + "step": 1939 + }, + { + "epoch": 0.22245155372090356, + "grad_norm": 0.2734578971446594, + "learning_rate": 0.0009060094333014226, + "loss": 2.5251, + "step": 1940 + }, + { + "epoch": 0.22256621947024424, + "grad_norm": 0.2649327416105208, + "learning_rate": 0.0009059010277501634, + "loss": 2.5062, + "step": 1941 + }, + { + "epoch": 0.2226808852195849, + "grad_norm": 0.2866524474555254, + "learning_rate": 0.0009057925662125946, + "loss": 2.6065, + "step": 1942 + }, + { + "epoch": 0.2227955509689256, + "grad_norm": 0.29712861989300904, + "learning_rate": 0.0009056840487036764, + "loss": 2.6034, + "step": 1943 + }, + { + "epoch": 0.22291021671826625, + "grad_norm": 0.29576524839234564, + "learning_rate": 0.0009055754752383768, + "loss": 2.5442, + "step": 1944 + }, + { + "epoch": 0.22302488246760693, + "grad_norm": 0.29975480471833216, + "learning_rate": 0.0009054668458316713, + "loss": 2.6857, + "step": 1945 + }, + { + "epoch": 0.2231395482169476, + "grad_norm": 0.31149051695256563, + "learning_rate": 0.0009053581604985433, + "loss": 2.6212, + "step": 1946 + }, + { + "epoch": 0.22325421396628828, + "grad_norm": 0.2722972735180898, + "learning_rate": 0.000905249419253984, + "loss": 2.7627, + "step": 1947 + }, + { + "epoch": 0.22336887971562894, + "grad_norm": 0.26737547910642695, + "learning_rate": 0.0009051406221129919, + "loss": 2.7949, + "step": 1948 + }, + { + "epoch": 0.22348354546496962, + "grad_norm": 0.28222234187931433, + "learning_rate": 0.0009050317690905737, + "loss": 2.5675, + "step": 1949 + }, + { + "epoch": 0.22359821121431028, + "grad_norm": 0.2597653078328947, + "learning_rate": 0.0009049228602017437, + "loss": 2.6455, + "step": 1950 + }, + { + "epoch": 0.22371287696365097, + "grad_norm": 0.26215588556396907, + "learning_rate": 0.0009048138954615235, + "loss": 2.6061, + "step": 1951 + }, + { + "epoch": 0.22382754271299163, + "grad_norm": 0.2510070527640125, + "learning_rate": 0.0009047048748849429, + "loss": 2.6727, + "step": 1952 + }, + { + "epoch": 0.22394220846233232, + "grad_norm": 0.26425335175254944, + "learning_rate": 0.0009045957984870393, + "loss": 2.4553, + "step": 1953 + }, + { + "epoch": 0.22405687421167297, + "grad_norm": 0.2640848196742785, + "learning_rate": 0.0009044866662828575, + "loss": 2.5792, + "step": 1954 + }, + { + "epoch": 0.22417153996101363, + "grad_norm": 0.28247075641625724, + "learning_rate": 0.0009043774782874503, + "loss": 2.5274, + "step": 1955 + }, + { + "epoch": 0.22428620571035432, + "grad_norm": 0.264571221801559, + "learning_rate": 0.0009042682345158781, + "loss": 2.6095, + "step": 1956 + }, + { + "epoch": 0.22440087145969498, + "grad_norm": 0.27020248369221733, + "learning_rate": 0.0009041589349832091, + "loss": 2.5915, + "step": 1957 + }, + { + "epoch": 0.22451553720903566, + "grad_norm": 0.2797456782640959, + "learning_rate": 0.000904049579704519, + "loss": 2.5646, + "step": 1958 + }, + { + "epoch": 0.22463020295837632, + "grad_norm": 0.24184221607772377, + "learning_rate": 0.0009039401686948912, + "loss": 2.5008, + "step": 1959 + }, + { + "epoch": 0.224744868707717, + "grad_norm": 0.26922537256729834, + "learning_rate": 0.0009038307019694169, + "loss": 2.5759, + "step": 1960 + }, + { + "epoch": 0.22485953445705767, + "grad_norm": 0.25375396608739825, + "learning_rate": 0.000903721179543195, + "loss": 2.6332, + "step": 1961 + }, + { + "epoch": 0.22497420020639836, + "grad_norm": 0.2711673553945902, + "learning_rate": 0.0009036116014313321, + "loss": 2.6366, + "step": 1962 + }, + { + "epoch": 0.22508886595573901, + "grad_norm": 0.255907562785913, + "learning_rate": 0.0009035019676489422, + "loss": 2.5764, + "step": 1963 + }, + { + "epoch": 0.2252035317050797, + "grad_norm": 0.2562543775541522, + "learning_rate": 0.0009033922782111473, + "loss": 2.5173, + "step": 1964 + }, + { + "epoch": 0.22531819745442036, + "grad_norm": 0.28997616558416617, + "learning_rate": 0.000903282533133077, + "loss": 2.5987, + "step": 1965 + }, + { + "epoch": 0.22543286320376105, + "grad_norm": 0.2567588176667797, + "learning_rate": 0.0009031727324298686, + "loss": 2.6207, + "step": 1966 + }, + { + "epoch": 0.2255475289531017, + "grad_norm": 0.2635235397589899, + "learning_rate": 0.0009030628761166668, + "loss": 2.6897, + "step": 1967 + }, + { + "epoch": 0.2256621947024424, + "grad_norm": 0.2755671557312719, + "learning_rate": 0.0009029529642086245, + "loss": 2.7063, + "step": 1968 + }, + { + "epoch": 0.22577686045178305, + "grad_norm": 0.30016741390490403, + "learning_rate": 0.0009028429967209015, + "loss": 2.7238, + "step": 1969 + }, + { + "epoch": 0.22589152620112374, + "grad_norm": 0.24278714521168407, + "learning_rate": 0.0009027329736686663, + "loss": 2.5206, + "step": 1970 + }, + { + "epoch": 0.2260061919504644, + "grad_norm": 0.2815515189648223, + "learning_rate": 0.000902622895067094, + "loss": 2.5978, + "step": 1971 + }, + { + "epoch": 0.22612085769980506, + "grad_norm": 0.30617726406810347, + "learning_rate": 0.000902512760931368, + "loss": 2.3762, + "step": 1972 + }, + { + "epoch": 0.22623552344914574, + "grad_norm": 0.3014572069320811, + "learning_rate": 0.0009024025712766792, + "loss": 2.571, + "step": 1973 + }, + { + "epoch": 0.2263501891984864, + "grad_norm": 0.3321771139923687, + "learning_rate": 0.0009022923261182264, + "loss": 2.7446, + "step": 1974 + }, + { + "epoch": 0.2264648549478271, + "grad_norm": 0.2840748245323971, + "learning_rate": 0.0009021820254712153, + "loss": 2.5953, + "step": 1975 + }, + { + "epoch": 0.22657952069716775, + "grad_norm": 0.29779548722246973, + "learning_rate": 0.0009020716693508602, + "loss": 2.8379, + "step": 1976 + }, + { + "epoch": 0.22669418644650843, + "grad_norm": 0.25930289688366726, + "learning_rate": 0.0009019612577723826, + "loss": 2.6142, + "step": 1977 + }, + { + "epoch": 0.2268088521958491, + "grad_norm": 0.3080007238585445, + "learning_rate": 0.0009018507907510114, + "loss": 2.7338, + "step": 1978 + }, + { + "epoch": 0.22692351794518978, + "grad_norm": 0.28650189508590296, + "learning_rate": 0.0009017402683019838, + "loss": 2.4582, + "step": 1979 + }, + { + "epoch": 0.22703818369453044, + "grad_norm": 0.256677474385844, + "learning_rate": 0.0009016296904405439, + "loss": 2.6505, + "step": 1980 + }, + { + "epoch": 0.22715284944387112, + "grad_norm": 0.22769947930848727, + "learning_rate": 0.0009015190571819438, + "loss": 2.4163, + "step": 1981 + }, + { + "epoch": 0.22726751519321178, + "grad_norm": 0.2634196780583111, + "learning_rate": 0.0009014083685414437, + "loss": 2.5837, + "step": 1982 + }, + { + "epoch": 0.22738218094255247, + "grad_norm": 0.2587993373100034, + "learning_rate": 0.0009012976245343106, + "loss": 2.5766, + "step": 1983 + }, + { + "epoch": 0.22749684669189313, + "grad_norm": 0.2679774633487803, + "learning_rate": 0.0009011868251758195, + "loss": 2.5395, + "step": 1984 + }, + { + "epoch": 0.22761151244123382, + "grad_norm": 0.2464764208886438, + "learning_rate": 0.0009010759704812533, + "loss": 2.5595, + "step": 1985 + }, + { + "epoch": 0.22772617819057447, + "grad_norm": 0.30034187657100625, + "learning_rate": 0.0009009650604659023, + "loss": 2.6609, + "step": 1986 + }, + { + "epoch": 0.22784084393991516, + "grad_norm": 0.27478891063492406, + "learning_rate": 0.0009008540951450641, + "loss": 2.5061, + "step": 1987 + }, + { + "epoch": 0.22795550968925582, + "grad_norm": 0.32996377214402783, + "learning_rate": 0.0009007430745340446, + "loss": 2.6858, + "step": 1988 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 0.2977665579874024, + "learning_rate": 0.0009006319986481567, + "loss": 2.48, + "step": 1989 + }, + { + "epoch": 0.22818484118793717, + "grad_norm": 0.2733278678858816, + "learning_rate": 0.0009005208675027215, + "loss": 2.5679, + "step": 1990 + }, + { + "epoch": 0.22829950693727782, + "grad_norm": 0.30524051395291907, + "learning_rate": 0.000900409681113067, + "loss": 2.5508, + "step": 1991 + }, + { + "epoch": 0.2284141726866185, + "grad_norm": 0.2800427758134093, + "learning_rate": 0.0009002984394945298, + "loss": 2.6196, + "step": 1992 + }, + { + "epoch": 0.22852883843595917, + "grad_norm": 0.2573831033007541, + "learning_rate": 0.0009001871426624528, + "loss": 2.3963, + "step": 1993 + }, + { + "epoch": 0.22864350418529986, + "grad_norm": 0.2891264003884552, + "learning_rate": 0.0009000757906321882, + "loss": 2.6205, + "step": 1994 + }, + { + "epoch": 0.22875816993464052, + "grad_norm": 0.3077980662285859, + "learning_rate": 0.0008999643834190941, + "loss": 2.7084, + "step": 1995 + }, + { + "epoch": 0.2288728356839812, + "grad_norm": 0.26250196112539614, + "learning_rate": 0.0008998529210385375, + "loss": 2.508, + "step": 1996 + }, + { + "epoch": 0.22898750143332186, + "grad_norm": 0.29108230390030326, + "learning_rate": 0.0008997414035058922, + "loss": 2.6415, + "step": 1997 + }, + { + "epoch": 0.22910216718266255, + "grad_norm": 0.2847300715396866, + "learning_rate": 0.0008996298308365403, + "loss": 2.7271, + "step": 1998 + }, + { + "epoch": 0.2292168329320032, + "grad_norm": 0.2682951236555709, + "learning_rate": 0.0008995182030458707, + "loss": 2.5449, + "step": 1999 + }, + { + "epoch": 0.2293314986813439, + "grad_norm": 0.26825010011137673, + "learning_rate": 0.0008994065201492804, + "loss": 2.4948, + "step": 2000 + }, + { + "epoch": 0.22944616443068455, + "grad_norm": 0.2672136640508456, + "learning_rate": 0.0008992947821621741, + "loss": 2.5919, + "step": 2001 + }, + { + "epoch": 0.22956083018002524, + "grad_norm": 0.25284187507606976, + "learning_rate": 0.0008991829890999639, + "loss": 2.5536, + "step": 2002 + }, + { + "epoch": 0.2296754959293659, + "grad_norm": 0.2594435921976009, + "learning_rate": 0.0008990711409780694, + "loss": 2.5322, + "step": 2003 + }, + { + "epoch": 0.22979016167870658, + "grad_norm": 0.29046265198513505, + "learning_rate": 0.000898959237811918, + "loss": 2.5496, + "step": 2004 + }, + { + "epoch": 0.22990482742804724, + "grad_norm": 0.2691829907742786, + "learning_rate": 0.0008988472796169447, + "loss": 2.685, + "step": 2005 + }, + { + "epoch": 0.2300194931773879, + "grad_norm": 0.2776127116050503, + "learning_rate": 0.0008987352664085919, + "loss": 2.6757, + "step": 2006 + }, + { + "epoch": 0.2301341589267286, + "grad_norm": 0.27972545281697553, + "learning_rate": 0.0008986231982023097, + "loss": 2.5275, + "step": 2007 + }, + { + "epoch": 0.23024882467606925, + "grad_norm": 0.3006560524748079, + "learning_rate": 0.0008985110750135556, + "loss": 2.5572, + "step": 2008 + }, + { + "epoch": 0.23036349042540993, + "grad_norm": 0.29022518851669266, + "learning_rate": 0.0008983988968577951, + "loss": 2.6411, + "step": 2009 + }, + { + "epoch": 0.2304781561747506, + "grad_norm": 0.2788691649586618, + "learning_rate": 0.0008982866637505009, + "loss": 2.6019, + "step": 2010 + }, + { + "epoch": 0.23059282192409128, + "grad_norm": 0.25984198988934054, + "learning_rate": 0.0008981743757071535, + "loss": 2.6253, + "step": 2011 + }, + { + "epoch": 0.23070748767343194, + "grad_norm": 0.2959609216818093, + "learning_rate": 0.000898062032743241, + "loss": 2.5903, + "step": 2012 + }, + { + "epoch": 0.23082215342277262, + "grad_norm": 0.27262415972247456, + "learning_rate": 0.0008979496348742586, + "loss": 2.5754, + "step": 2013 + }, + { + "epoch": 0.23093681917211328, + "grad_norm": 0.3136942085157362, + "learning_rate": 0.0008978371821157098, + "loss": 2.533, + "step": 2014 + }, + { + "epoch": 0.23105148492145397, + "grad_norm": 0.2560522239365619, + "learning_rate": 0.0008977246744831052, + "loss": 2.4911, + "step": 2015 + }, + { + "epoch": 0.23116615067079463, + "grad_norm": 0.28767308296605126, + "learning_rate": 0.0008976121119919631, + "loss": 2.516, + "step": 2016 + }, + { + "epoch": 0.23128081642013532, + "grad_norm": 0.2888355088889753, + "learning_rate": 0.0008974994946578093, + "loss": 2.6907, + "step": 2017 + }, + { + "epoch": 0.23139548216947597, + "grad_norm": 0.26796277320531725, + "learning_rate": 0.0008973868224961772, + "loss": 2.582, + "step": 2018 + }, + { + "epoch": 0.23151014791881666, + "grad_norm": 0.2728003912529523, + "learning_rate": 0.0008972740955226079, + "loss": 2.5613, + "step": 2019 + }, + { + "epoch": 0.23162481366815732, + "grad_norm": 0.2385740338397358, + "learning_rate": 0.0008971613137526498, + "loss": 2.5391, + "step": 2020 + }, + { + "epoch": 0.231739479417498, + "grad_norm": 0.295634394796691, + "learning_rate": 0.000897048477201859, + "loss": 2.6977, + "step": 2021 + }, + { + "epoch": 0.23185414516683867, + "grad_norm": 0.29329468558132005, + "learning_rate": 0.0008969355858857994, + "loss": 2.6056, + "step": 2022 + }, + { + "epoch": 0.23196881091617932, + "grad_norm": 0.290327238218866, + "learning_rate": 0.0008968226398200418, + "loss": 2.5391, + "step": 2023 + }, + { + "epoch": 0.23208347666552, + "grad_norm": 0.2971721146485292, + "learning_rate": 0.0008967096390201652, + "loss": 2.7293, + "step": 2024 + }, + { + "epoch": 0.23219814241486067, + "grad_norm": 0.3033863910349409, + "learning_rate": 0.000896596583501756, + "loss": 2.5282, + "step": 2025 + }, + { + "epoch": 0.23231280816420136, + "grad_norm": 0.27782767988868723, + "learning_rate": 0.0008964834732804078, + "loss": 2.6139, + "step": 2026 + }, + { + "epoch": 0.23242747391354202, + "grad_norm": 0.29141287249691744, + "learning_rate": 0.0008963703083717222, + "loss": 2.4896, + "step": 2027 + }, + { + "epoch": 0.2325421396628827, + "grad_norm": 0.3166698914267737, + "learning_rate": 0.000896257088791308, + "loss": 2.5739, + "step": 2028 + }, + { + "epoch": 0.23265680541222336, + "grad_norm": 0.2837361071196873, + "learning_rate": 0.0008961438145547818, + "loss": 2.7438, + "step": 2029 + }, + { + "epoch": 0.23277147116156405, + "grad_norm": 0.2819738877993953, + "learning_rate": 0.0008960304856777675, + "loss": 2.6682, + "step": 2030 + }, + { + "epoch": 0.2328861369109047, + "grad_norm": 0.28331415765898876, + "learning_rate": 0.0008959171021758967, + "loss": 2.6967, + "step": 2031 + }, + { + "epoch": 0.2330008026602454, + "grad_norm": 0.26747427331751383, + "learning_rate": 0.0008958036640648086, + "loss": 2.6176, + "step": 2032 + }, + { + "epoch": 0.23311546840958605, + "grad_norm": 0.23171303557327103, + "learning_rate": 0.0008956901713601499, + "loss": 2.6092, + "step": 2033 + }, + { + "epoch": 0.23323013415892674, + "grad_norm": 0.25595726239499245, + "learning_rate": 0.0008955766240775745, + "loss": 2.6118, + "step": 2034 + }, + { + "epoch": 0.2333447999082674, + "grad_norm": 0.2577097650207744, + "learning_rate": 0.000895463022232744, + "loss": 2.6268, + "step": 2035 + }, + { + "epoch": 0.23345946565760808, + "grad_norm": 0.27300775433398217, + "learning_rate": 0.0008953493658413279, + "loss": 2.4494, + "step": 2036 + }, + { + "epoch": 0.23357413140694874, + "grad_norm": 0.2750678679822444, + "learning_rate": 0.0008952356549190028, + "loss": 2.591, + "step": 2037 + }, + { + "epoch": 0.23368879715628943, + "grad_norm": 0.24804890414123798, + "learning_rate": 0.000895121889481453, + "loss": 2.515, + "step": 2038 + }, + { + "epoch": 0.2338034629056301, + "grad_norm": 0.2917829401576775, + "learning_rate": 0.0008950080695443704, + "loss": 2.6668, + "step": 2039 + }, + { + "epoch": 0.23391812865497075, + "grad_norm": 0.26037178457093035, + "learning_rate": 0.000894894195123454, + "loss": 2.4622, + "step": 2040 + }, + { + "epoch": 0.23403279440431143, + "grad_norm": 0.2930535976906809, + "learning_rate": 0.0008947802662344108, + "loss": 2.6106, + "step": 2041 + }, + { + "epoch": 0.2341474601536521, + "grad_norm": 0.30151602216694573, + "learning_rate": 0.0008946662828929551, + "loss": 2.5512, + "step": 2042 + }, + { + "epoch": 0.23426212590299278, + "grad_norm": 0.2725319309718963, + "learning_rate": 0.0008945522451148086, + "loss": 2.5881, + "step": 2043 + }, + { + "epoch": 0.23437679165233344, + "grad_norm": 0.2547792911152703, + "learning_rate": 0.0008944381529157008, + "loss": 2.5752, + "step": 2044 + }, + { + "epoch": 0.23449145740167412, + "grad_norm": 0.2924461510705153, + "learning_rate": 0.0008943240063113685, + "loss": 2.6693, + "step": 2045 + }, + { + "epoch": 0.23460612315101478, + "grad_norm": 0.2660855655343598, + "learning_rate": 0.0008942098053175559, + "loss": 2.6169, + "step": 2046 + }, + { + "epoch": 0.23472078890035547, + "grad_norm": 0.28891568303753096, + "learning_rate": 0.0008940955499500152, + "loss": 2.6187, + "step": 2047 + }, + { + "epoch": 0.23483545464969613, + "grad_norm": 0.2928016772942542, + "learning_rate": 0.0008939812402245053, + "loss": 2.5617, + "step": 2048 + }, + { + "epoch": 0.23495012039903682, + "grad_norm": 0.2796043634393937, + "learning_rate": 0.0008938668761567934, + "loss": 2.6915, + "step": 2049 + }, + { + "epoch": 0.23506478614837747, + "grad_norm": 0.2639617720309141, + "learning_rate": 0.0008937524577626537, + "loss": 2.5295, + "step": 2050 + }, + { + "epoch": 0.23517945189771816, + "grad_norm": 0.28332468148810735, + "learning_rate": 0.0008936379850578679, + "loss": 2.608, + "step": 2051 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.29056727485708317, + "learning_rate": 0.0008935234580582258, + "loss": 2.5081, + "step": 2052 + }, + { + "epoch": 0.2354087833963995, + "grad_norm": 0.2789825026228519, + "learning_rate": 0.0008934088767795236, + "loss": 2.6309, + "step": 2053 + }, + { + "epoch": 0.23552344914574017, + "grad_norm": 0.25803688005507325, + "learning_rate": 0.0008932942412375662, + "loss": 2.6421, + "step": 2054 + }, + { + "epoch": 0.23563811489508085, + "grad_norm": 0.28067943823856445, + "learning_rate": 0.0008931795514481648, + "loss": 2.5896, + "step": 2055 + }, + { + "epoch": 0.2357527806444215, + "grad_norm": 0.2626655722620775, + "learning_rate": 0.0008930648074271391, + "loss": 2.5226, + "step": 2056 + }, + { + "epoch": 0.23586744639376217, + "grad_norm": 0.27793313295909816, + "learning_rate": 0.0008929500091903158, + "loss": 2.6282, + "step": 2057 + }, + { + "epoch": 0.23598211214310286, + "grad_norm": 0.35055272401260446, + "learning_rate": 0.0008928351567535289, + "loss": 2.7735, + "step": 2058 + }, + { + "epoch": 0.23609677789244352, + "grad_norm": 0.2917132432380481, + "learning_rate": 0.0008927202501326204, + "loss": 2.5179, + "step": 2059 + }, + { + "epoch": 0.2362114436417842, + "grad_norm": 0.27648522151288074, + "learning_rate": 0.0008926052893434391, + "loss": 2.6153, + "step": 2060 + }, + { + "epoch": 0.23632610939112486, + "grad_norm": 0.2929851217902678, + "learning_rate": 0.000892490274401842, + "loss": 2.5626, + "step": 2061 + }, + { + "epoch": 0.23644077514046555, + "grad_norm": 0.3222088766833428, + "learning_rate": 0.0008923752053236931, + "loss": 2.5384, + "step": 2062 + }, + { + "epoch": 0.2365554408898062, + "grad_norm": 0.28362840031094594, + "learning_rate": 0.0008922600821248638, + "loss": 2.5666, + "step": 2063 + }, + { + "epoch": 0.2366701066391469, + "grad_norm": 0.3026770530884364, + "learning_rate": 0.0008921449048212336, + "loss": 2.4204, + "step": 2064 + }, + { + "epoch": 0.23678477238848755, + "grad_norm": 0.27490345210920014, + "learning_rate": 0.0008920296734286886, + "loss": 2.5661, + "step": 2065 + }, + { + "epoch": 0.23689943813782824, + "grad_norm": 0.2911812069660475, + "learning_rate": 0.000891914387963123, + "loss": 2.6534, + "step": 2066 + }, + { + "epoch": 0.2370141038871689, + "grad_norm": 0.3227271905331262, + "learning_rate": 0.0008917990484404382, + "loss": 2.435, + "step": 2067 + }, + { + "epoch": 0.23712876963650958, + "grad_norm": 0.302089936192019, + "learning_rate": 0.0008916836548765427, + "loss": 2.7398, + "step": 2068 + }, + { + "epoch": 0.23724343538585024, + "grad_norm": 0.2970085321764663, + "learning_rate": 0.0008915682072873535, + "loss": 2.5906, + "step": 2069 + }, + { + "epoch": 0.23735810113519093, + "grad_norm": 0.28518087127667213, + "learning_rate": 0.000891452705688794, + "loss": 2.6535, + "step": 2070 + }, + { + "epoch": 0.2374727668845316, + "grad_norm": 0.2611123593225715, + "learning_rate": 0.0008913371500967955, + "loss": 2.4873, + "step": 2071 + }, + { + "epoch": 0.23758743263387228, + "grad_norm": 0.24976771111916965, + "learning_rate": 0.0008912215405272967, + "loss": 2.5358, + "step": 2072 + }, + { + "epoch": 0.23770209838321293, + "grad_norm": 0.2663529780949666, + "learning_rate": 0.0008911058769962437, + "loss": 2.7244, + "step": 2073 + }, + { + "epoch": 0.2378167641325536, + "grad_norm": 0.27255045045625054, + "learning_rate": 0.0008909901595195902, + "loss": 2.6532, + "step": 2074 + }, + { + "epoch": 0.23793142988189428, + "grad_norm": 0.26066324396023205, + "learning_rate": 0.0008908743881132972, + "loss": 2.6092, + "step": 2075 + }, + { + "epoch": 0.23804609563123494, + "grad_norm": 0.2733797221471541, + "learning_rate": 0.000890758562793333, + "loss": 2.6079, + "step": 2076 + }, + { + "epoch": 0.23816076138057563, + "grad_norm": 0.26982144597322827, + "learning_rate": 0.0008906426835756736, + "loss": 2.7263, + "step": 2077 + }, + { + "epoch": 0.23827542712991628, + "grad_norm": 0.2848488148017355, + "learning_rate": 0.0008905267504763024, + "loss": 2.5552, + "step": 2078 + }, + { + "epoch": 0.23839009287925697, + "grad_norm": 0.2708983786640919, + "learning_rate": 0.00089041076351121, + "loss": 2.5775, + "step": 2079 + }, + { + "epoch": 0.23850475862859763, + "grad_norm": 0.30089596508206945, + "learning_rate": 0.0008902947226963946, + "loss": 2.6537, + "step": 2080 + }, + { + "epoch": 0.23861942437793832, + "grad_norm": 0.31414989648770386, + "learning_rate": 0.0008901786280478621, + "loss": 2.6811, + "step": 2081 + }, + { + "epoch": 0.23873409012727898, + "grad_norm": 0.29653672102809836, + "learning_rate": 0.0008900624795816252, + "loss": 2.5481, + "step": 2082 + }, + { + "epoch": 0.23884875587661966, + "grad_norm": 0.27451522996875266, + "learning_rate": 0.0008899462773137047, + "loss": 2.5268, + "step": 2083 + }, + { + "epoch": 0.23896342162596032, + "grad_norm": 0.28515334716983537, + "learning_rate": 0.0008898300212601281, + "loss": 2.5153, + "step": 2084 + }, + { + "epoch": 0.239078087375301, + "grad_norm": 0.29701746576479504, + "learning_rate": 0.0008897137114369309, + "loss": 2.6846, + "step": 2085 + }, + { + "epoch": 0.23919275312464167, + "grad_norm": 0.27696166580891945, + "learning_rate": 0.000889597347860156, + "loss": 2.6531, + "step": 2086 + }, + { + "epoch": 0.23930741887398235, + "grad_norm": 0.2909980793085496, + "learning_rate": 0.0008894809305458534, + "loss": 2.6851, + "step": 2087 + }, + { + "epoch": 0.239422084623323, + "grad_norm": 0.2754157723614315, + "learning_rate": 0.0008893644595100803, + "loss": 2.5742, + "step": 2088 + }, + { + "epoch": 0.2395367503726637, + "grad_norm": 0.291648554431086, + "learning_rate": 0.0008892479347689022, + "loss": 2.5607, + "step": 2089 + }, + { + "epoch": 0.23965141612200436, + "grad_norm": 0.284995634125818, + "learning_rate": 0.0008891313563383911, + "loss": 2.7239, + "step": 2090 + }, + { + "epoch": 0.23976608187134502, + "grad_norm": 0.25591892297732627, + "learning_rate": 0.0008890147242346272, + "loss": 2.5548, + "step": 2091 + }, + { + "epoch": 0.2398807476206857, + "grad_norm": 0.2838251614906059, + "learning_rate": 0.0008888980384736972, + "loss": 2.5922, + "step": 2092 + }, + { + "epoch": 0.23999541337002636, + "grad_norm": 0.2742489823788876, + "learning_rate": 0.0008887812990716957, + "loss": 2.5153, + "step": 2093 + }, + { + "epoch": 0.24011007911936705, + "grad_norm": 0.2913391536991361, + "learning_rate": 0.000888664506044725, + "loss": 2.4547, + "step": 2094 + }, + { + "epoch": 0.2402247448687077, + "grad_norm": 0.24916125892655305, + "learning_rate": 0.0008885476594088943, + "loss": 2.5089, + "step": 2095 + }, + { + "epoch": 0.2403394106180484, + "grad_norm": 0.27984691585298577, + "learning_rate": 0.0008884307591803203, + "loss": 2.4254, + "step": 2096 + }, + { + "epoch": 0.24045407636738905, + "grad_norm": 0.3127172976315307, + "learning_rate": 0.0008883138053751274, + "loss": 2.6464, + "step": 2097 + }, + { + "epoch": 0.24056874211672974, + "grad_norm": 0.2629728202024017, + "learning_rate": 0.0008881967980094469, + "loss": 2.6215, + "step": 2098 + }, + { + "epoch": 0.2406834078660704, + "grad_norm": 0.2660403165936794, + "learning_rate": 0.0008880797370994178, + "loss": 2.4748, + "step": 2099 + }, + { + "epoch": 0.24079807361541108, + "grad_norm": 0.2757967313778535, + "learning_rate": 0.0008879626226611865, + "loss": 2.6183, + "step": 2100 + }, + { + "epoch": 0.24091273936475174, + "grad_norm": 0.30581378298112066, + "learning_rate": 0.0008878454547109065, + "loss": 2.5983, + "step": 2101 + }, + { + "epoch": 0.24102740511409243, + "grad_norm": 0.2657161835172812, + "learning_rate": 0.0008877282332647392, + "loss": 2.5119, + "step": 2102 + }, + { + "epoch": 0.2411420708634331, + "grad_norm": 0.28291918803406085, + "learning_rate": 0.0008876109583388528, + "loss": 2.6194, + "step": 2103 + }, + { + "epoch": 0.24125673661277378, + "grad_norm": 0.28503406464744396, + "learning_rate": 0.0008874936299494232, + "loss": 2.6356, + "step": 2104 + }, + { + "epoch": 0.24137140236211443, + "grad_norm": 0.2932311454641575, + "learning_rate": 0.0008873762481126337, + "loss": 2.557, + "step": 2105 + }, + { + "epoch": 0.24148606811145512, + "grad_norm": 0.26203023217233284, + "learning_rate": 0.0008872588128446749, + "loss": 2.576, + "step": 2106 + }, + { + "epoch": 0.24160073386079578, + "grad_norm": 0.29478108813955256, + "learning_rate": 0.0008871413241617446, + "loss": 2.4965, + "step": 2107 + }, + { + "epoch": 0.24171539961013644, + "grad_norm": 0.2885864704894152, + "learning_rate": 0.0008870237820800482, + "loss": 2.7297, + "step": 2108 + }, + { + "epoch": 0.24183006535947713, + "grad_norm": 0.3079972953284916, + "learning_rate": 0.0008869061866157985, + "loss": 2.7824, + "step": 2109 + }, + { + "epoch": 0.24194473110881778, + "grad_norm": 0.2930604790395038, + "learning_rate": 0.0008867885377852153, + "loss": 2.5534, + "step": 2110 + }, + { + "epoch": 0.24205939685815847, + "grad_norm": 0.2587602955529037, + "learning_rate": 0.0008866708356045263, + "loss": 2.5457, + "step": 2111 + }, + { + "epoch": 0.24217406260749913, + "grad_norm": 0.27537765505377526, + "learning_rate": 0.0008865530800899661, + "loss": 2.5993, + "step": 2112 + }, + { + "epoch": 0.24228872835683982, + "grad_norm": 0.2576908746403623, + "learning_rate": 0.000886435271257777, + "loss": 2.555, + "step": 2113 + }, + { + "epoch": 0.24240339410618048, + "grad_norm": 0.24960534322044353, + "learning_rate": 0.0008863174091242083, + "loss": 2.6807, + "step": 2114 + }, + { + "epoch": 0.24251805985552116, + "grad_norm": 0.2494879295425208, + "learning_rate": 0.0008861994937055167, + "loss": 2.5761, + "step": 2115 + }, + { + "epoch": 0.24263272560486182, + "grad_norm": 0.2445031689763979, + "learning_rate": 0.0008860815250179668, + "loss": 2.5988, + "step": 2116 + }, + { + "epoch": 0.2427473913542025, + "grad_norm": 0.2548489042535031, + "learning_rate": 0.00088596350307783, + "loss": 2.552, + "step": 2117 + }, + { + "epoch": 0.24286205710354317, + "grad_norm": 0.25714456865733787, + "learning_rate": 0.0008858454279013848, + "loss": 2.4304, + "step": 2118 + }, + { + "epoch": 0.24297672285288385, + "grad_norm": 0.26127206422432886, + "learning_rate": 0.0008857272995049178, + "loss": 2.7842, + "step": 2119 + }, + { + "epoch": 0.2430913886022245, + "grad_norm": 0.25636324021133794, + "learning_rate": 0.0008856091179047225, + "loss": 2.5493, + "step": 2120 + }, + { + "epoch": 0.2432060543515652, + "grad_norm": 0.2660947714054031, + "learning_rate": 0.0008854908831170998, + "loss": 2.6183, + "step": 2121 + }, + { + "epoch": 0.24332072010090586, + "grad_norm": 0.2474972961650398, + "learning_rate": 0.0008853725951583578, + "loss": 2.4701, + "step": 2122 + }, + { + "epoch": 0.24343538585024654, + "grad_norm": 0.2873052454032071, + "learning_rate": 0.0008852542540448123, + "loss": 2.5648, + "step": 2123 + }, + { + "epoch": 0.2435500515995872, + "grad_norm": 0.25216119347971927, + "learning_rate": 0.0008851358597927859, + "loss": 2.5548, + "step": 2124 + }, + { + "epoch": 0.24366471734892786, + "grad_norm": 0.28806755938933815, + "learning_rate": 0.0008850174124186091, + "loss": 2.5978, + "step": 2125 + }, + { + "epoch": 0.24377938309826855, + "grad_norm": 0.29679955624019355, + "learning_rate": 0.0008848989119386193, + "loss": 2.66, + "step": 2126 + }, + { + "epoch": 0.2438940488476092, + "grad_norm": 0.27547071257363437, + "learning_rate": 0.0008847803583691614, + "loss": 2.6101, + "step": 2127 + }, + { + "epoch": 0.2440087145969499, + "grad_norm": 0.25162276324212957, + "learning_rate": 0.0008846617517265878, + "loss": 2.5406, + "step": 2128 + }, + { + "epoch": 0.24412338034629055, + "grad_norm": 0.26300932807766597, + "learning_rate": 0.0008845430920272578, + "loss": 2.6072, + "step": 2129 + }, + { + "epoch": 0.24423804609563124, + "grad_norm": 0.27272647585638227, + "learning_rate": 0.0008844243792875384, + "loss": 2.4734, + "step": 2130 + }, + { + "epoch": 0.2443527118449719, + "grad_norm": 0.27752219779695264, + "learning_rate": 0.0008843056135238034, + "loss": 2.6215, + "step": 2131 + }, + { + "epoch": 0.24446737759431258, + "grad_norm": 0.26074212968109706, + "learning_rate": 0.0008841867947524349, + "loss": 2.5537, + "step": 2132 + }, + { + "epoch": 0.24458204334365324, + "grad_norm": 0.2978108333818431, + "learning_rate": 0.0008840679229898211, + "loss": 2.6106, + "step": 2133 + }, + { + "epoch": 0.24469670909299393, + "grad_norm": 0.24477389564073224, + "learning_rate": 0.0008839489982523583, + "loss": 2.4645, + "step": 2134 + }, + { + "epoch": 0.2448113748423346, + "grad_norm": 0.22725209154719025, + "learning_rate": 0.00088383002055645, + "loss": 2.4696, + "step": 2135 + }, + { + "epoch": 0.24492604059167528, + "grad_norm": 0.2938523284784154, + "learning_rate": 0.000883710989918507, + "loss": 2.598, + "step": 2136 + }, + { + "epoch": 0.24504070634101593, + "grad_norm": 0.261331527021155, + "learning_rate": 0.0008835919063549469, + "loss": 2.6178, + "step": 2137 + }, + { + "epoch": 0.24515537209035662, + "grad_norm": 0.26859697226278734, + "learning_rate": 0.0008834727698821953, + "loss": 2.6278, + "step": 2138 + }, + { + "epoch": 0.24527003783969728, + "grad_norm": 0.27695959912974366, + "learning_rate": 0.0008833535805166848, + "loss": 2.621, + "step": 2139 + }, + { + "epoch": 0.24538470358903797, + "grad_norm": 0.2563986158997265, + "learning_rate": 0.0008832343382748551, + "loss": 2.6361, + "step": 2140 + }, + { + "epoch": 0.24549936933837863, + "grad_norm": 0.2721319431725499, + "learning_rate": 0.0008831150431731537, + "loss": 2.5338, + "step": 2141 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 0.2775860886077074, + "learning_rate": 0.0008829956952280349, + "loss": 2.6157, + "step": 2142 + }, + { + "epoch": 0.24572870083705997, + "grad_norm": 0.3132617899302838, + "learning_rate": 0.0008828762944559605, + "loss": 2.5978, + "step": 2143 + }, + { + "epoch": 0.24584336658640063, + "grad_norm": 0.29381633344940555, + "learning_rate": 0.0008827568408733996, + "loss": 2.6299, + "step": 2144 + }, + { + "epoch": 0.24595803233574132, + "grad_norm": 0.30463962240462816, + "learning_rate": 0.0008826373344968285, + "loss": 2.5962, + "step": 2145 + }, + { + "epoch": 0.24607269808508198, + "grad_norm": 0.29782790236136236, + "learning_rate": 0.0008825177753427309, + "loss": 2.5782, + "step": 2146 + }, + { + "epoch": 0.24618736383442266, + "grad_norm": 0.30281231221941285, + "learning_rate": 0.0008823981634275975, + "loss": 2.6383, + "step": 2147 + }, + { + "epoch": 0.24630202958376332, + "grad_norm": 0.24215061991169268, + "learning_rate": 0.0008822784987679266, + "loss": 2.5703, + "step": 2148 + }, + { + "epoch": 0.246416695333104, + "grad_norm": 0.31253175077088474, + "learning_rate": 0.0008821587813802237, + "loss": 2.6332, + "step": 2149 + }, + { + "epoch": 0.24653136108244467, + "grad_norm": 0.24886492815626768, + "learning_rate": 0.0008820390112810017, + "loss": 2.6129, + "step": 2150 + }, + { + "epoch": 0.24664602683178535, + "grad_norm": 0.2643695296927128, + "learning_rate": 0.0008819191884867803, + "loss": 2.5687, + "step": 2151 + }, + { + "epoch": 0.246760692581126, + "grad_norm": 0.2515771588573229, + "learning_rate": 0.0008817993130140869, + "loss": 2.5878, + "step": 2152 + }, + { + "epoch": 0.2468753583304667, + "grad_norm": 0.25750067607299454, + "learning_rate": 0.000881679384879456, + "loss": 2.6469, + "step": 2153 + }, + { + "epoch": 0.24699002407980736, + "grad_norm": 0.27383810231683675, + "learning_rate": 0.0008815594040994294, + "loss": 2.665, + "step": 2154 + }, + { + "epoch": 0.24710468982914804, + "grad_norm": 0.27762673834547347, + "learning_rate": 0.0008814393706905564, + "loss": 2.472, + "step": 2155 + }, + { + "epoch": 0.2472193555784887, + "grad_norm": 0.27819375963144277, + "learning_rate": 0.000881319284669393, + "loss": 2.6225, + "step": 2156 + }, + { + "epoch": 0.2473340213278294, + "grad_norm": 0.33649204522384596, + "learning_rate": 0.0008811991460525027, + "loss": 2.5926, + "step": 2157 + }, + { + "epoch": 0.24744868707717005, + "grad_norm": 0.24599223525986988, + "learning_rate": 0.0008810789548564566, + "loss": 2.549, + "step": 2158 + }, + { + "epoch": 0.2475633528265107, + "grad_norm": 0.26582966025208427, + "learning_rate": 0.0008809587110978328, + "loss": 2.7502, + "step": 2159 + }, + { + "epoch": 0.2476780185758514, + "grad_norm": 0.28866014819628577, + "learning_rate": 0.0008808384147932165, + "loss": 2.6429, + "step": 2160 + }, + { + "epoch": 0.24779268432519205, + "grad_norm": 0.32020278086194337, + "learning_rate": 0.0008807180659592004, + "loss": 2.643, + "step": 2161 + }, + { + "epoch": 0.24790735007453274, + "grad_norm": 0.2648469713338004, + "learning_rate": 0.0008805976646123841, + "loss": 2.5552, + "step": 2162 + }, + { + "epoch": 0.2480220158238734, + "grad_norm": 0.2988374877564414, + "learning_rate": 0.0008804772107693748, + "loss": 2.528, + "step": 2163 + }, + { + "epoch": 0.24813668157321409, + "grad_norm": 0.26260890842065365, + "learning_rate": 0.000880356704446787, + "loss": 2.6647, + "step": 2164 + }, + { + "epoch": 0.24825134732255474, + "grad_norm": 0.30818242750749236, + "learning_rate": 0.000880236145661242, + "loss": 2.5685, + "step": 2165 + }, + { + "epoch": 0.24836601307189543, + "grad_norm": 0.2787901023832149, + "learning_rate": 0.0008801155344293686, + "loss": 2.6331, + "step": 2166 + }, + { + "epoch": 0.2484806788212361, + "grad_norm": 0.28566430283277655, + "learning_rate": 0.0008799948707678031, + "loss": 2.5343, + "step": 2167 + }, + { + "epoch": 0.24859534457057678, + "grad_norm": 0.27868848627223514, + "learning_rate": 0.0008798741546931883, + "loss": 2.6632, + "step": 2168 + }, + { + "epoch": 0.24871001031991744, + "grad_norm": 0.30755355774382004, + "learning_rate": 0.000879753386222175, + "loss": 2.5377, + "step": 2169 + }, + { + "epoch": 0.24882467606925812, + "grad_norm": 0.2609655515484714, + "learning_rate": 0.0008796325653714208, + "loss": 2.6144, + "step": 2170 + }, + { + "epoch": 0.24893934181859878, + "grad_norm": 0.24629231747677463, + "learning_rate": 0.0008795116921575907, + "loss": 2.5052, + "step": 2171 + }, + { + "epoch": 0.24905400756793947, + "grad_norm": 0.2644459369657751, + "learning_rate": 0.0008793907665973569, + "loss": 2.5238, + "step": 2172 + }, + { + "epoch": 0.24916867331728013, + "grad_norm": 0.2978346463875741, + "learning_rate": 0.0008792697887073986, + "loss": 2.6363, + "step": 2173 + }, + { + "epoch": 0.2492833390666208, + "grad_norm": 0.2500251150066589, + "learning_rate": 0.0008791487585044025, + "loss": 2.559, + "step": 2174 + }, + { + "epoch": 0.24939800481596147, + "grad_norm": 0.25651828158917256, + "learning_rate": 0.0008790276760050624, + "loss": 2.4847, + "step": 2175 + }, + { + "epoch": 0.24951267056530213, + "grad_norm": 0.2859445171250938, + "learning_rate": 0.0008789065412260793, + "loss": 2.5623, + "step": 2176 + }, + { + "epoch": 0.24962733631464282, + "grad_norm": 0.2811702279116988, + "learning_rate": 0.0008787853541841614, + "loss": 2.4792, + "step": 2177 + }, + { + "epoch": 0.24974200206398348, + "grad_norm": 0.2906677192542289, + "learning_rate": 0.0008786641148960243, + "loss": 2.5391, + "step": 2178 + }, + { + "epoch": 0.24985666781332416, + "grad_norm": 0.2531651188370428, + "learning_rate": 0.0008785428233783905, + "loss": 2.5464, + "step": 2179 + }, + { + "epoch": 0.24997133356266482, + "grad_norm": 0.3326303153761615, + "learning_rate": 0.0008784214796479899, + "loss": 2.5027, + "step": 2180 + }, + { + "epoch": 0.2500859993120055, + "grad_norm": 0.3193515188542318, + "learning_rate": 0.0008783000837215596, + "loss": 2.6335, + "step": 2181 + }, + { + "epoch": 0.2502006650613462, + "grad_norm": 0.30926487274517483, + "learning_rate": 0.0008781786356158437, + "loss": 2.4585, + "step": 2182 + }, + { + "epoch": 0.25031533081068685, + "grad_norm": 0.2715811566625429, + "learning_rate": 0.0008780571353475939, + "loss": 2.5039, + "step": 2183 + }, + { + "epoch": 0.2504299965600275, + "grad_norm": 0.2964901218750409, + "learning_rate": 0.0008779355829335684, + "loss": 2.6953, + "step": 2184 + }, + { + "epoch": 0.25054466230936817, + "grad_norm": 0.24544218550243066, + "learning_rate": 0.0008778139783905337, + "loss": 2.5912, + "step": 2185 + }, + { + "epoch": 0.2506593280587089, + "grad_norm": 0.2529431259467698, + "learning_rate": 0.0008776923217352624, + "loss": 2.6229, + "step": 2186 + }, + { + "epoch": 0.25077399380804954, + "grad_norm": 0.25739451217442405, + "learning_rate": 0.0008775706129845347, + "loss": 2.4644, + "step": 2187 + }, + { + "epoch": 0.2508886595573902, + "grad_norm": 0.3060219737497598, + "learning_rate": 0.0008774488521551381, + "loss": 2.5758, + "step": 2188 + }, + { + "epoch": 0.25100332530673086, + "grad_norm": 0.23999009094817067, + "learning_rate": 0.0008773270392638671, + "loss": 2.5606, + "step": 2189 + }, + { + "epoch": 0.2511179910560716, + "grad_norm": 0.27697438838967164, + "learning_rate": 0.0008772051743275237, + "loss": 2.5119, + "step": 2190 + }, + { + "epoch": 0.25123265680541224, + "grad_norm": 0.234245135757513, + "learning_rate": 0.0008770832573629166, + "loss": 2.4491, + "step": 2191 + }, + { + "epoch": 0.2513473225547529, + "grad_norm": 0.2825508410145344, + "learning_rate": 0.000876961288386862, + "loss": 2.494, + "step": 2192 + }, + { + "epoch": 0.25146198830409355, + "grad_norm": 0.2800573382234421, + "learning_rate": 0.0008768392674161833, + "loss": 2.5834, + "step": 2193 + }, + { + "epoch": 0.2515766540534342, + "grad_norm": 0.2633897696161971, + "learning_rate": 0.0008767171944677108, + "loss": 2.5069, + "step": 2194 + }, + { + "epoch": 0.2516913198027749, + "grad_norm": 0.28697020795131756, + "learning_rate": 0.0008765950695582821, + "loss": 2.4781, + "step": 2195 + }, + { + "epoch": 0.2518059855521156, + "grad_norm": 0.277875981133512, + "learning_rate": 0.0008764728927047423, + "loss": 2.7404, + "step": 2196 + }, + { + "epoch": 0.25192065130145624, + "grad_norm": 0.31508805771299275, + "learning_rate": 0.0008763506639239432, + "loss": 2.68, + "step": 2197 + }, + { + "epoch": 0.2520353170507969, + "grad_norm": 0.2648019354850289, + "learning_rate": 0.0008762283832327436, + "loss": 2.5557, + "step": 2198 + }, + { + "epoch": 0.2521499828001376, + "grad_norm": 0.2807908635015067, + "learning_rate": 0.0008761060506480103, + "loss": 2.5822, + "step": 2199 + }, + { + "epoch": 0.2522646485494783, + "grad_norm": 0.2484454147193635, + "learning_rate": 0.0008759836661866165, + "loss": 2.6415, + "step": 2200 + }, + { + "epoch": 0.25237931429881894, + "grad_norm": 0.2508520028353344, + "learning_rate": 0.0008758612298654429, + "loss": 2.5816, + "step": 2201 + }, + { + "epoch": 0.2524939800481596, + "grad_norm": 0.2656137039955321, + "learning_rate": 0.0008757387417013772, + "loss": 2.6365, + "step": 2202 + }, + { + "epoch": 0.2526086457975003, + "grad_norm": 0.24477903790884062, + "learning_rate": 0.0008756162017113144, + "loss": 2.666, + "step": 2203 + }, + { + "epoch": 0.25272331154684097, + "grad_norm": 0.257342758419661, + "learning_rate": 0.0008754936099121565, + "loss": 2.6132, + "step": 2204 + }, + { + "epoch": 0.2528379772961816, + "grad_norm": 0.27160361774340414, + "learning_rate": 0.0008753709663208125, + "loss": 2.6301, + "step": 2205 + }, + { + "epoch": 0.2529526430455223, + "grad_norm": 0.29788517351294885, + "learning_rate": 0.0008752482709541989, + "loss": 2.4578, + "step": 2206 + }, + { + "epoch": 0.253067308794863, + "grad_norm": 0.2541502158599345, + "learning_rate": 0.0008751255238292392, + "loss": 2.7553, + "step": 2207 + }, + { + "epoch": 0.25318197454420366, + "grad_norm": 0.25521331647142953, + "learning_rate": 0.0008750027249628643, + "loss": 2.6085, + "step": 2208 + }, + { + "epoch": 0.2532966402935443, + "grad_norm": 0.2854386178227579, + "learning_rate": 0.0008748798743720115, + "loss": 2.5559, + "step": 2209 + }, + { + "epoch": 0.253411306042885, + "grad_norm": 0.27181717228193636, + "learning_rate": 0.0008747569720736257, + "loss": 2.5418, + "step": 2210 + }, + { + "epoch": 0.25352597179222564, + "grad_norm": 0.2782953203836333, + "learning_rate": 0.0008746340180846595, + "loss": 2.502, + "step": 2211 + }, + { + "epoch": 0.25364063754156635, + "grad_norm": 0.24452727294337298, + "learning_rate": 0.0008745110124220714, + "loss": 2.4816, + "step": 2212 + }, + { + "epoch": 0.253755303290907, + "grad_norm": 0.2634087885656492, + "learning_rate": 0.000874387955102828, + "loss": 2.6175, + "step": 2213 + }, + { + "epoch": 0.25386996904024767, + "grad_norm": 0.28336580811993284, + "learning_rate": 0.0008742648461439028, + "loss": 2.5207, + "step": 2214 + }, + { + "epoch": 0.2539846347895883, + "grad_norm": 0.30107090185932867, + "learning_rate": 0.0008741416855622762, + "loss": 2.5349, + "step": 2215 + }, + { + "epoch": 0.25409930053892904, + "grad_norm": 0.2890848556663944, + "learning_rate": 0.0008740184733749357, + "loss": 2.5632, + "step": 2216 + }, + { + "epoch": 0.2542139662882697, + "grad_norm": 0.2898358629216642, + "learning_rate": 0.0008738952095988763, + "loss": 2.4426, + "step": 2217 + }, + { + "epoch": 0.25432863203761036, + "grad_norm": 0.28206572032378063, + "learning_rate": 0.0008737718942510999, + "loss": 2.5301, + "step": 2218 + }, + { + "epoch": 0.254443297786951, + "grad_norm": 0.30775317701560984, + "learning_rate": 0.0008736485273486155, + "loss": 2.6642, + "step": 2219 + }, + { + "epoch": 0.25455796353629173, + "grad_norm": 0.30761789488620694, + "learning_rate": 0.000873525108908439, + "loss": 2.5808, + "step": 2220 + }, + { + "epoch": 0.2546726292856324, + "grad_norm": 0.27849907028762777, + "learning_rate": 0.0008734016389475938, + "loss": 2.5928, + "step": 2221 + }, + { + "epoch": 0.25478729503497305, + "grad_norm": 0.2601712252443285, + "learning_rate": 0.0008732781174831103, + "loss": 2.6341, + "step": 2222 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 0.29336267711412556, + "learning_rate": 0.0008731545445320257, + "loss": 2.6945, + "step": 2223 + }, + { + "epoch": 0.2550166265336544, + "grad_norm": 0.26181899424102684, + "learning_rate": 0.0008730309201113847, + "loss": 2.6164, + "step": 2224 + }, + { + "epoch": 0.2551312922829951, + "grad_norm": 0.29306490022846154, + "learning_rate": 0.0008729072442382387, + "loss": 2.6241, + "step": 2225 + }, + { + "epoch": 0.25524595803233574, + "grad_norm": 0.2665302061003145, + "learning_rate": 0.0008727835169296469, + "loss": 2.5605, + "step": 2226 + }, + { + "epoch": 0.2553606237816764, + "grad_norm": 0.2912004963267849, + "learning_rate": 0.0008726597382026747, + "loss": 2.6974, + "step": 2227 + }, + { + "epoch": 0.25547528953101706, + "grad_norm": 0.2544195464220581, + "learning_rate": 0.0008725359080743951, + "loss": 2.5399, + "step": 2228 + }, + { + "epoch": 0.2555899552803578, + "grad_norm": 0.26771490699631617, + "learning_rate": 0.0008724120265618882, + "loss": 2.5924, + "step": 2229 + }, + { + "epoch": 0.25570462102969843, + "grad_norm": 0.2860306691778564, + "learning_rate": 0.0008722880936822411, + "loss": 2.6421, + "step": 2230 + }, + { + "epoch": 0.2558192867790391, + "grad_norm": 0.26006800853774487, + "learning_rate": 0.0008721641094525481, + "loss": 2.6681, + "step": 2231 + }, + { + "epoch": 0.25593395252837975, + "grad_norm": 0.24753378686516658, + "learning_rate": 0.0008720400738899101, + "loss": 2.6519, + "step": 2232 + }, + { + "epoch": 0.25604861827772046, + "grad_norm": 0.2611555553673946, + "learning_rate": 0.0008719159870114356, + "loss": 2.7554, + "step": 2233 + }, + { + "epoch": 0.2561632840270611, + "grad_norm": 0.2641734092054442, + "learning_rate": 0.00087179184883424, + "loss": 2.5378, + "step": 2234 + }, + { + "epoch": 0.2562779497764018, + "grad_norm": 0.2862601053336579, + "learning_rate": 0.000871667659375446, + "loss": 2.5587, + "step": 2235 + }, + { + "epoch": 0.25639261552574244, + "grad_norm": 0.26929362885664176, + "learning_rate": 0.0008715434186521831, + "loss": 2.7045, + "step": 2236 + }, + { + "epoch": 0.25650728127508315, + "grad_norm": 0.25995581843538057, + "learning_rate": 0.0008714191266815877, + "loss": 2.6604, + "step": 2237 + }, + { + "epoch": 0.2566219470244238, + "grad_norm": 0.26718769637286294, + "learning_rate": 0.0008712947834808036, + "loss": 2.7194, + "step": 2238 + }, + { + "epoch": 0.25673661277376447, + "grad_norm": 0.2819896919993998, + "learning_rate": 0.0008711703890669818, + "loss": 2.7002, + "step": 2239 + }, + { + "epoch": 0.25685127852310513, + "grad_norm": 0.28295472742410066, + "learning_rate": 0.0008710459434572799, + "loss": 2.5698, + "step": 2240 + }, + { + "epoch": 0.25696594427244585, + "grad_norm": 0.26807300227378156, + "learning_rate": 0.0008709214466688629, + "loss": 2.4006, + "step": 2241 + }, + { + "epoch": 0.2570806100217865, + "grad_norm": 0.29296859623661425, + "learning_rate": 0.0008707968987189028, + "loss": 2.6625, + "step": 2242 + }, + { + "epoch": 0.25719527577112716, + "grad_norm": 0.2712407379275656, + "learning_rate": 0.0008706722996245784, + "loss": 2.5442, + "step": 2243 + }, + { + "epoch": 0.2573099415204678, + "grad_norm": 0.2650985823525957, + "learning_rate": 0.0008705476494030762, + "loss": 2.536, + "step": 2244 + }, + { + "epoch": 0.2574246072698085, + "grad_norm": 0.3120190406251584, + "learning_rate": 0.0008704229480715887, + "loss": 2.73, + "step": 2245 + }, + { + "epoch": 0.2575392730191492, + "grad_norm": 0.3045619757284662, + "learning_rate": 0.0008702981956473166, + "loss": 2.489, + "step": 2246 + }, + { + "epoch": 0.25765393876848985, + "grad_norm": 0.3015644488012433, + "learning_rate": 0.0008701733921474671, + "loss": 2.6156, + "step": 2247 + }, + { + "epoch": 0.2577686045178305, + "grad_norm": 0.2896816265627592, + "learning_rate": 0.0008700485375892539, + "loss": 2.5312, + "step": 2248 + }, + { + "epoch": 0.25788327026717117, + "grad_norm": 0.2664923768416574, + "learning_rate": 0.000869923631989899, + "loss": 2.5933, + "step": 2249 + }, + { + "epoch": 0.2579979360165119, + "grad_norm": 0.27558176584652383, + "learning_rate": 0.0008697986753666304, + "loss": 2.5954, + "step": 2250 + }, + { + "epoch": 0.25811260176585255, + "grad_norm": 0.28325361328164844, + "learning_rate": 0.0008696736677366834, + "loss": 2.5474, + "step": 2251 + }, + { + "epoch": 0.2582272675151932, + "grad_norm": 0.275625344387118, + "learning_rate": 0.0008695486091173008, + "loss": 2.5564, + "step": 2252 + }, + { + "epoch": 0.25834193326453386, + "grad_norm": 0.26234831296859634, + "learning_rate": 0.0008694234995257318, + "loss": 2.5326, + "step": 2253 + }, + { + "epoch": 0.2584565990138746, + "grad_norm": 0.2711624295680704, + "learning_rate": 0.0008692983389792326, + "loss": 2.6239, + "step": 2254 + }, + { + "epoch": 0.25857126476321524, + "grad_norm": 0.28714049442026357, + "learning_rate": 0.0008691731274950671, + "loss": 2.5635, + "step": 2255 + }, + { + "epoch": 0.2586859305125559, + "grad_norm": 0.24116240418726115, + "learning_rate": 0.0008690478650905059, + "loss": 2.5789, + "step": 2256 + }, + { + "epoch": 0.25880059626189655, + "grad_norm": 0.2339413263774728, + "learning_rate": 0.0008689225517828263, + "loss": 2.5411, + "step": 2257 + }, + { + "epoch": 0.25891526201123727, + "grad_norm": 0.24285699025705562, + "learning_rate": 0.000868797187589313, + "loss": 2.5574, + "step": 2258 + }, + { + "epoch": 0.2590299277605779, + "grad_norm": 0.2569789387457917, + "learning_rate": 0.0008686717725272577, + "loss": 2.5587, + "step": 2259 + }, + { + "epoch": 0.2591445935099186, + "grad_norm": 0.26186626068656543, + "learning_rate": 0.0008685463066139587, + "loss": 2.5272, + "step": 2260 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.28113688577406803, + "learning_rate": 0.000868420789866722, + "loss": 2.5089, + "step": 2261 + }, + { + "epoch": 0.2593739250085999, + "grad_norm": 0.2674832989457913, + "learning_rate": 0.00086829522230286, + "loss": 2.5474, + "step": 2262 + }, + { + "epoch": 0.2594885907579406, + "grad_norm": 0.279935803774432, + "learning_rate": 0.0008681696039396924, + "loss": 2.5312, + "step": 2263 + }, + { + "epoch": 0.2596032565072813, + "grad_norm": 0.3140926597774166, + "learning_rate": 0.0008680439347945459, + "loss": 2.7051, + "step": 2264 + }, + { + "epoch": 0.25971792225662194, + "grad_norm": 0.26318236477325124, + "learning_rate": 0.0008679182148847542, + "loss": 2.6129, + "step": 2265 + }, + { + "epoch": 0.2598325880059626, + "grad_norm": 0.2534825062346113, + "learning_rate": 0.000867792444227658, + "loss": 2.3572, + "step": 2266 + }, + { + "epoch": 0.2599472537553033, + "grad_norm": 0.25444406860824587, + "learning_rate": 0.0008676666228406047, + "loss": 2.6224, + "step": 2267 + }, + { + "epoch": 0.26006191950464397, + "grad_norm": 0.27342413708333674, + "learning_rate": 0.0008675407507409492, + "loss": 2.6509, + "step": 2268 + }, + { + "epoch": 0.2601765852539846, + "grad_norm": 0.263645666074119, + "learning_rate": 0.0008674148279460532, + "loss": 2.7187, + "step": 2269 + }, + { + "epoch": 0.2602912510033253, + "grad_norm": 0.2747055708545287, + "learning_rate": 0.0008672888544732851, + "loss": 2.6211, + "step": 2270 + }, + { + "epoch": 0.260405916752666, + "grad_norm": 0.2501465085484311, + "learning_rate": 0.0008671628303400208, + "loss": 2.6499, + "step": 2271 + }, + { + "epoch": 0.26052058250200666, + "grad_norm": 0.2651447647654398, + "learning_rate": 0.0008670367555636427, + "loss": 2.6995, + "step": 2272 + }, + { + "epoch": 0.2606352482513473, + "grad_norm": 0.2818071413622489, + "learning_rate": 0.0008669106301615406, + "loss": 2.5712, + "step": 2273 + }, + { + "epoch": 0.260749914000688, + "grad_norm": 0.27795347653828967, + "learning_rate": 0.0008667844541511109, + "loss": 2.6053, + "step": 2274 + }, + { + "epoch": 0.2608645797500287, + "grad_norm": 0.2706301706183267, + "learning_rate": 0.0008666582275497575, + "loss": 2.6176, + "step": 2275 + }, + { + "epoch": 0.26097924549936935, + "grad_norm": 0.27947435276032123, + "learning_rate": 0.0008665319503748908, + "loss": 2.5361, + "step": 2276 + }, + { + "epoch": 0.26109391124871, + "grad_norm": 0.28012157778219815, + "learning_rate": 0.0008664056226439281, + "loss": 2.5968, + "step": 2277 + }, + { + "epoch": 0.26120857699805067, + "grad_norm": 0.2627926551931407, + "learning_rate": 0.0008662792443742942, + "loss": 2.592, + "step": 2278 + }, + { + "epoch": 0.2613232427473913, + "grad_norm": 0.2687960150664113, + "learning_rate": 0.0008661528155834203, + "loss": 2.628, + "step": 2279 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.24767276404619976, + "learning_rate": 0.0008660263362887451, + "loss": 2.5398, + "step": 2280 + }, + { + "epoch": 0.2615525742460727, + "grad_norm": 0.27999600688641113, + "learning_rate": 0.000865899806507714, + "loss": 2.6205, + "step": 2281 + }, + { + "epoch": 0.26166723999541336, + "grad_norm": 0.27008255050964314, + "learning_rate": 0.0008657732262577791, + "loss": 2.6563, + "step": 2282 + }, + { + "epoch": 0.261781905744754, + "grad_norm": 0.2499751243536985, + "learning_rate": 0.0008656465955564, + "loss": 2.5624, + "step": 2283 + }, + { + "epoch": 0.26189657149409473, + "grad_norm": 0.2702387919166331, + "learning_rate": 0.0008655199144210428, + "loss": 2.6124, + "step": 2284 + }, + { + "epoch": 0.2620112372434354, + "grad_norm": 0.24898275034168982, + "learning_rate": 0.0008653931828691808, + "loss": 2.5676, + "step": 2285 + }, + { + "epoch": 0.26212590299277605, + "grad_norm": 0.22982996965081692, + "learning_rate": 0.0008652664009182945, + "loss": 2.4897, + "step": 2286 + }, + { + "epoch": 0.2622405687421167, + "grad_norm": 0.23881766012194008, + "learning_rate": 0.0008651395685858708, + "loss": 2.6736, + "step": 2287 + }, + { + "epoch": 0.2623552344914574, + "grad_norm": 0.2621231316643747, + "learning_rate": 0.0008650126858894035, + "loss": 2.418, + "step": 2288 + }, + { + "epoch": 0.2624699002407981, + "grad_norm": 0.25745968051603146, + "learning_rate": 0.0008648857528463943, + "loss": 2.6177, + "step": 2289 + }, + { + "epoch": 0.26258456599013874, + "grad_norm": 0.2410131085682068, + "learning_rate": 0.0008647587694743506, + "loss": 2.6091, + "step": 2290 + }, + { + "epoch": 0.2626992317394794, + "grad_norm": 0.2911608861533398, + "learning_rate": 0.0008646317357907877, + "loss": 2.5578, + "step": 2291 + }, + { + "epoch": 0.2628138974888201, + "grad_norm": 0.2597313089724537, + "learning_rate": 0.0008645046518132273, + "loss": 2.604, + "step": 2292 + }, + { + "epoch": 0.2629285632381608, + "grad_norm": 0.28612944571895904, + "learning_rate": 0.0008643775175591983, + "loss": 2.667, + "step": 2293 + }, + { + "epoch": 0.26304322898750143, + "grad_norm": 0.29389120261015184, + "learning_rate": 0.0008642503330462364, + "loss": 2.5746, + "step": 2294 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.32426563848643625, + "learning_rate": 0.0008641230982918844, + "loss": 2.4325, + "step": 2295 + }, + { + "epoch": 0.26327256048618275, + "grad_norm": 0.3111532756914654, + "learning_rate": 0.0008639958133136918, + "loss": 2.5558, + "step": 2296 + }, + { + "epoch": 0.26338722623552346, + "grad_norm": 0.3085588522037956, + "learning_rate": 0.000863868478129215, + "loss": 2.5575, + "step": 2297 + }, + { + "epoch": 0.2635018919848641, + "grad_norm": 0.2987257402683066, + "learning_rate": 0.0008637410927560176, + "loss": 2.729, + "step": 2298 + }, + { + "epoch": 0.2636165577342048, + "grad_norm": 0.2722103541175778, + "learning_rate": 0.0008636136572116702, + "loss": 2.7217, + "step": 2299 + }, + { + "epoch": 0.26373122348354544, + "grad_norm": 0.24881535814502806, + "learning_rate": 0.0008634861715137497, + "loss": 2.5942, + "step": 2300 + }, + { + "epoch": 0.26384588923288615, + "grad_norm": 0.2884808729743463, + "learning_rate": 0.0008633586356798406, + "loss": 2.6489, + "step": 2301 + }, + { + "epoch": 0.2639605549822268, + "grad_norm": 0.24489913293384957, + "learning_rate": 0.0008632310497275339, + "loss": 2.5774, + "step": 2302 + }, + { + "epoch": 0.2640752207315675, + "grad_norm": 0.2877128025810281, + "learning_rate": 0.0008631034136744278, + "loss": 2.615, + "step": 2303 + }, + { + "epoch": 0.26418988648090813, + "grad_norm": 0.2547734183267323, + "learning_rate": 0.0008629757275381272, + "loss": 2.5753, + "step": 2304 + }, + { + "epoch": 0.26430455223024885, + "grad_norm": 0.2446393197687182, + "learning_rate": 0.0008628479913362438, + "loss": 2.7021, + "step": 2305 + }, + { + "epoch": 0.2644192179795895, + "grad_norm": 0.26814388699750946, + "learning_rate": 0.0008627202050863966, + "loss": 2.5921, + "step": 2306 + }, + { + "epoch": 0.26453388372893016, + "grad_norm": 0.23947191148845884, + "learning_rate": 0.0008625923688062112, + "loss": 2.5164, + "step": 2307 + }, + { + "epoch": 0.2646485494782708, + "grad_norm": 0.26722104557776455, + "learning_rate": 0.0008624644825133201, + "loss": 2.528, + "step": 2308 + }, + { + "epoch": 0.26476321522761154, + "grad_norm": 0.24810937347291795, + "learning_rate": 0.0008623365462253627, + "loss": 2.4456, + "step": 2309 + }, + { + "epoch": 0.2648778809769522, + "grad_norm": 0.24831882185878187, + "learning_rate": 0.0008622085599599857, + "loss": 2.5779, + "step": 2310 + }, + { + "epoch": 0.26499254672629285, + "grad_norm": 0.29787307242777805, + "learning_rate": 0.0008620805237348422, + "loss": 2.4263, + "step": 2311 + }, + { + "epoch": 0.2651072124756335, + "grad_norm": 0.2587598462078542, + "learning_rate": 0.0008619524375675922, + "loss": 2.5832, + "step": 2312 + }, + { + "epoch": 0.26522187822497423, + "grad_norm": 0.3100564905846058, + "learning_rate": 0.0008618243014759028, + "loss": 2.5114, + "step": 2313 + }, + { + "epoch": 0.2653365439743149, + "grad_norm": 0.2818893566801113, + "learning_rate": 0.0008616961154774483, + "loss": 2.5174, + "step": 2314 + }, + { + "epoch": 0.26545120972365555, + "grad_norm": 0.25761873777185484, + "learning_rate": 0.0008615678795899091, + "loss": 2.6134, + "step": 2315 + }, + { + "epoch": 0.2655658754729962, + "grad_norm": 0.2732215523199713, + "learning_rate": 0.0008614395938309729, + "loss": 2.618, + "step": 2316 + }, + { + "epoch": 0.26568054122233686, + "grad_norm": 0.26976045020583034, + "learning_rate": 0.0008613112582183345, + "loss": 2.4406, + "step": 2317 + }, + { + "epoch": 0.2657952069716776, + "grad_norm": 0.2572225278488332, + "learning_rate": 0.0008611828727696953, + "loss": 2.6612, + "step": 2318 + }, + { + "epoch": 0.26590987272101824, + "grad_norm": 0.2846060532839916, + "learning_rate": 0.0008610544375027636, + "loss": 2.5487, + "step": 2319 + }, + { + "epoch": 0.2660245384703589, + "grad_norm": 0.23319881504909837, + "learning_rate": 0.0008609259524352544, + "loss": 2.504, + "step": 2320 + }, + { + "epoch": 0.26613920421969955, + "grad_norm": 0.2655935855591492, + "learning_rate": 0.00086079741758489, + "loss": 2.6042, + "step": 2321 + }, + { + "epoch": 0.26625386996904027, + "grad_norm": 0.258056321688033, + "learning_rate": 0.0008606688329693994, + "loss": 2.4456, + "step": 2322 + }, + { + "epoch": 0.2663685357183809, + "grad_norm": 0.3250098759672273, + "learning_rate": 0.000860540198606518, + "loss": 2.6761, + "step": 2323 + }, + { + "epoch": 0.2664832014677216, + "grad_norm": 0.2853917234116672, + "learning_rate": 0.0008604115145139889, + "loss": 2.5193, + "step": 2324 + }, + { + "epoch": 0.26659786721706225, + "grad_norm": 0.26109745392877237, + "learning_rate": 0.0008602827807095614, + "loss": 2.7035, + "step": 2325 + }, + { + "epoch": 0.26671253296640296, + "grad_norm": 0.2739598735209245, + "learning_rate": 0.000860153997210992, + "loss": 2.608, + "step": 2326 + }, + { + "epoch": 0.2668271987157436, + "grad_norm": 0.29390691633841487, + "learning_rate": 0.0008600251640360438, + "loss": 2.6331, + "step": 2327 + }, + { + "epoch": 0.2669418644650843, + "grad_norm": 0.23810613046277532, + "learning_rate": 0.0008598962812024868, + "loss": 2.5662, + "step": 2328 + }, + { + "epoch": 0.26705653021442494, + "grad_norm": 0.267632072156182, + "learning_rate": 0.0008597673487280983, + "loss": 2.5877, + "step": 2329 + }, + { + "epoch": 0.26717119596376565, + "grad_norm": 0.2544187566098131, + "learning_rate": 0.0008596383666306616, + "loss": 2.4746, + "step": 2330 + }, + { + "epoch": 0.2672858617131063, + "grad_norm": 0.2749768367889401, + "learning_rate": 0.0008595093349279677, + "loss": 2.6508, + "step": 2331 + }, + { + "epoch": 0.26740052746244697, + "grad_norm": 0.25312208607294706, + "learning_rate": 0.000859380253637814, + "loss": 2.5226, + "step": 2332 + }, + { + "epoch": 0.2675151932117876, + "grad_norm": 0.26215612093210444, + "learning_rate": 0.0008592511227780045, + "loss": 2.6001, + "step": 2333 + }, + { + "epoch": 0.2676298589611283, + "grad_norm": 0.25599855025681517, + "learning_rate": 0.0008591219423663506, + "loss": 2.6444, + "step": 2334 + }, + { + "epoch": 0.267744524710469, + "grad_norm": 0.2773340988173593, + "learning_rate": 0.0008589927124206702, + "loss": 2.5458, + "step": 2335 + }, + { + "epoch": 0.26785919045980966, + "grad_norm": 0.2960624828024453, + "learning_rate": 0.0008588634329587884, + "loss": 2.5557, + "step": 2336 + }, + { + "epoch": 0.2679738562091503, + "grad_norm": 0.3002920480819714, + "learning_rate": 0.0008587341039985363, + "loss": 2.6424, + "step": 2337 + }, + { + "epoch": 0.268088521958491, + "grad_norm": 0.28600619522031384, + "learning_rate": 0.0008586047255577527, + "loss": 2.5821, + "step": 2338 + }, + { + "epoch": 0.2682031877078317, + "grad_norm": 0.2689083665762978, + "learning_rate": 0.000858475297654283, + "loss": 2.4294, + "step": 2339 + }, + { + "epoch": 0.26831785345717235, + "grad_norm": 0.2991149739333668, + "learning_rate": 0.0008583458203059791, + "loss": 2.5029, + "step": 2340 + }, + { + "epoch": 0.268432519206513, + "grad_norm": 0.2342273778261057, + "learning_rate": 0.0008582162935306998, + "loss": 2.5249, + "step": 2341 + }, + { + "epoch": 0.26854718495585367, + "grad_norm": 0.2785858019619248, + "learning_rate": 0.0008580867173463112, + "loss": 2.6476, + "step": 2342 + }, + { + "epoch": 0.2686618507051944, + "grad_norm": 0.26287471423883446, + "learning_rate": 0.0008579570917706857, + "loss": 2.5578, + "step": 2343 + }, + { + "epoch": 0.26877651645453504, + "grad_norm": 0.25362805040973957, + "learning_rate": 0.0008578274168217026, + "loss": 2.4611, + "step": 2344 + }, + { + "epoch": 0.2688911822038757, + "grad_norm": 0.2800245801392496, + "learning_rate": 0.0008576976925172481, + "loss": 2.5901, + "step": 2345 + }, + { + "epoch": 0.26900584795321636, + "grad_norm": 0.28218377782448223, + "learning_rate": 0.0008575679188752154, + "loss": 2.5458, + "step": 2346 + }, + { + "epoch": 0.2691205137025571, + "grad_norm": 0.29359380738226193, + "learning_rate": 0.0008574380959135042, + "loss": 2.5408, + "step": 2347 + }, + { + "epoch": 0.26923517945189773, + "grad_norm": 0.2895979524877854, + "learning_rate": 0.0008573082236500209, + "loss": 2.5359, + "step": 2348 + }, + { + "epoch": 0.2693498452012384, + "grad_norm": 0.2780567330672685, + "learning_rate": 0.0008571783021026791, + "loss": 2.6347, + "step": 2349 + }, + { + "epoch": 0.26946451095057905, + "grad_norm": 0.2875165221234477, + "learning_rate": 0.000857048331289399, + "loss": 2.4006, + "step": 2350 + }, + { + "epoch": 0.2695791766999197, + "grad_norm": 0.26286409714740516, + "learning_rate": 0.0008569183112281075, + "loss": 2.4972, + "step": 2351 + }, + { + "epoch": 0.2696938424492604, + "grad_norm": 0.282120865991687, + "learning_rate": 0.0008567882419367386, + "loss": 2.5622, + "step": 2352 + }, + { + "epoch": 0.2698085081986011, + "grad_norm": 0.24898876873022102, + "learning_rate": 0.0008566581234332327, + "loss": 2.5247, + "step": 2353 + }, + { + "epoch": 0.26992317394794174, + "grad_norm": 0.26425377192459965, + "learning_rate": 0.000856527955735537, + "loss": 2.484, + "step": 2354 + }, + { + "epoch": 0.2700378396972824, + "grad_norm": 0.240228977936261, + "learning_rate": 0.000856397738861606, + "loss": 2.3536, + "step": 2355 + }, + { + "epoch": 0.2701525054466231, + "grad_norm": 0.28021635249607196, + "learning_rate": 0.0008562674728294004, + "loss": 2.5926, + "step": 2356 + }, + { + "epoch": 0.2702671711959638, + "grad_norm": 0.2650233129350482, + "learning_rate": 0.0008561371576568881, + "loss": 2.4835, + "step": 2357 + }, + { + "epoch": 0.27038183694530443, + "grad_norm": 0.2633359009816506, + "learning_rate": 0.0008560067933620435, + "loss": 2.4433, + "step": 2358 + }, + { + "epoch": 0.2704965026946451, + "grad_norm": 0.2946647492962772, + "learning_rate": 0.0008558763799628477, + "loss": 2.6765, + "step": 2359 + }, + { + "epoch": 0.2706111684439858, + "grad_norm": 0.3010921807269685, + "learning_rate": 0.000855745917477289, + "loss": 2.5501, + "step": 2360 + }, + { + "epoch": 0.27072583419332646, + "grad_norm": 0.25278395579767765, + "learning_rate": 0.0008556154059233622, + "loss": 2.5531, + "step": 2361 + }, + { + "epoch": 0.2708404999426671, + "grad_norm": 0.28667903087024116, + "learning_rate": 0.0008554848453190686, + "loss": 2.5936, + "step": 2362 + }, + { + "epoch": 0.2709551656920078, + "grad_norm": 0.2789194585401289, + "learning_rate": 0.0008553542356824168, + "loss": 2.526, + "step": 2363 + }, + { + "epoch": 0.2710698314413485, + "grad_norm": 0.2579661629699175, + "learning_rate": 0.0008552235770314221, + "loss": 2.5519, + "step": 2364 + }, + { + "epoch": 0.27118449719068916, + "grad_norm": 0.2981772464908079, + "learning_rate": 0.0008550928693841058, + "loss": 2.5796, + "step": 2365 + }, + { + "epoch": 0.2712991629400298, + "grad_norm": 0.282944415161875, + "learning_rate": 0.0008549621127584971, + "loss": 2.4011, + "step": 2366 + }, + { + "epoch": 0.2714138286893705, + "grad_norm": 0.290034883553547, + "learning_rate": 0.000854831307172631, + "loss": 2.5336, + "step": 2367 + }, + { + "epoch": 0.27152849443871113, + "grad_norm": 0.2550067760669629, + "learning_rate": 0.0008547004526445499, + "loss": 2.5061, + "step": 2368 + }, + { + "epoch": 0.27164316018805185, + "grad_norm": 0.25115199820867107, + "learning_rate": 0.0008545695491923024, + "loss": 2.4673, + "step": 2369 + }, + { + "epoch": 0.2717578259373925, + "grad_norm": 0.25328329478881895, + "learning_rate": 0.0008544385968339445, + "loss": 2.5227, + "step": 2370 + }, + { + "epoch": 0.27187249168673316, + "grad_norm": 0.23758367028112443, + "learning_rate": 0.0008543075955875382, + "loss": 2.561, + "step": 2371 + }, + { + "epoch": 0.2719871574360738, + "grad_norm": 0.2661259078259569, + "learning_rate": 0.0008541765454711527, + "loss": 2.6191, + "step": 2372 + }, + { + "epoch": 0.27210182318541454, + "grad_norm": 0.26616889714959324, + "learning_rate": 0.0008540454465028643, + "loss": 2.6842, + "step": 2373 + }, + { + "epoch": 0.2722164889347552, + "grad_norm": 0.24992609194787788, + "learning_rate": 0.0008539142987007551, + "loss": 2.4991, + "step": 2374 + }, + { + "epoch": 0.27233115468409586, + "grad_norm": 0.24783171434680837, + "learning_rate": 0.0008537831020829147, + "loss": 2.5491, + "step": 2375 + }, + { + "epoch": 0.2724458204334365, + "grad_norm": 0.2525704602115128, + "learning_rate": 0.0008536518566674389, + "loss": 2.549, + "step": 2376 + }, + { + "epoch": 0.27256048618277723, + "grad_norm": 0.30882216886242264, + "learning_rate": 0.0008535205624724309, + "loss": 2.5271, + "step": 2377 + }, + { + "epoch": 0.2726751519321179, + "grad_norm": 0.26141228900484637, + "learning_rate": 0.0008533892195159999, + "loss": 2.5505, + "step": 2378 + }, + { + "epoch": 0.27278981768145855, + "grad_norm": 0.2994381817516312, + "learning_rate": 0.0008532578278162624, + "loss": 2.6687, + "step": 2379 + }, + { + "epoch": 0.2729044834307992, + "grad_norm": 0.28209418377683404, + "learning_rate": 0.0008531263873913411, + "loss": 2.545, + "step": 2380 + }, + { + "epoch": 0.2730191491801399, + "grad_norm": 0.2784338902283846, + "learning_rate": 0.0008529948982593658, + "loss": 2.538, + "step": 2381 + }, + { + "epoch": 0.2731338149294806, + "grad_norm": 0.2884789419416564, + "learning_rate": 0.0008528633604384733, + "loss": 2.4972, + "step": 2382 + }, + { + "epoch": 0.27324848067882124, + "grad_norm": 0.2631061396320721, + "learning_rate": 0.0008527317739468061, + "loss": 2.5626, + "step": 2383 + }, + { + "epoch": 0.2733631464281619, + "grad_norm": 0.2521564988524757, + "learning_rate": 0.0008526001388025145, + "loss": 2.5278, + "step": 2384 + }, + { + "epoch": 0.27347781217750256, + "grad_norm": 0.2549158276931651, + "learning_rate": 0.0008524684550237549, + "loss": 2.6384, + "step": 2385 + }, + { + "epoch": 0.27359247792684327, + "grad_norm": 0.24669552814327803, + "learning_rate": 0.0008523367226286907, + "loss": 2.5763, + "step": 2386 + }, + { + "epoch": 0.27370714367618393, + "grad_norm": 0.2678545522168121, + "learning_rate": 0.0008522049416354915, + "loss": 2.4326, + "step": 2387 + }, + { + "epoch": 0.2738218094255246, + "grad_norm": 0.251078084078413, + "learning_rate": 0.0008520731120623344, + "loss": 2.5283, + "step": 2388 + }, + { + "epoch": 0.27393647517486525, + "grad_norm": 0.282524954608787, + "learning_rate": 0.0008519412339274027, + "loss": 2.6199, + "step": 2389 + }, + { + "epoch": 0.27405114092420596, + "grad_norm": 0.2524336125108421, + "learning_rate": 0.0008518093072488863, + "loss": 2.5556, + "step": 2390 + }, + { + "epoch": 0.2741658066735466, + "grad_norm": 0.2512518467991055, + "learning_rate": 0.000851677332044982, + "loss": 2.6842, + "step": 2391 + }, + { + "epoch": 0.2742804724228873, + "grad_norm": 0.2578260544313042, + "learning_rate": 0.0008515453083338935, + "loss": 2.6217, + "step": 2392 + }, + { + "epoch": 0.27439513817222794, + "grad_norm": 0.2694193318946938, + "learning_rate": 0.0008514132361338306, + "loss": 2.6302, + "step": 2393 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 0.26191327678962867, + "learning_rate": 0.0008512811154630104, + "loss": 2.6819, + "step": 2394 + }, + { + "epoch": 0.2746244696709093, + "grad_norm": 0.2782602232038335, + "learning_rate": 0.0008511489463396563, + "loss": 2.5318, + "step": 2395 + }, + { + "epoch": 0.27473913542024997, + "grad_norm": 0.2885139016158749, + "learning_rate": 0.0008510167287819986, + "loss": 2.6102, + "step": 2396 + }, + { + "epoch": 0.27485380116959063, + "grad_norm": 0.2581976087460682, + "learning_rate": 0.0008508844628082741, + "loss": 2.5229, + "step": 2397 + }, + { + "epoch": 0.27496846691893134, + "grad_norm": 0.2794809924836603, + "learning_rate": 0.0008507521484367265, + "loss": 2.6789, + "step": 2398 + }, + { + "epoch": 0.275083132668272, + "grad_norm": 0.24471043596524703, + "learning_rate": 0.0008506197856856059, + "loss": 2.5583, + "step": 2399 + }, + { + "epoch": 0.27519779841761266, + "grad_norm": 0.2961465808074545, + "learning_rate": 0.0008504873745731694, + "loss": 2.553, + "step": 2400 + }, + { + "epoch": 0.2753124641669533, + "grad_norm": 0.24845955451060422, + "learning_rate": 0.0008503549151176804, + "loss": 2.5822, + "step": 2401 + }, + { + "epoch": 0.275427129916294, + "grad_norm": 0.29059266575733805, + "learning_rate": 0.0008502224073374092, + "loss": 2.6134, + "step": 2402 + }, + { + "epoch": 0.2755417956656347, + "grad_norm": 0.2573275862831089, + "learning_rate": 0.0008500898512506328, + "loss": 2.3948, + "step": 2403 + }, + { + "epoch": 0.27565646141497535, + "grad_norm": 0.2708467629757828, + "learning_rate": 0.0008499572468756347, + "loss": 2.6952, + "step": 2404 + }, + { + "epoch": 0.275771127164316, + "grad_norm": 0.2584709937525598, + "learning_rate": 0.0008498245942307052, + "loss": 2.4943, + "step": 2405 + }, + { + "epoch": 0.27588579291365667, + "grad_norm": 0.2638199755017948, + "learning_rate": 0.0008496918933341413, + "loss": 2.6278, + "step": 2406 + }, + { + "epoch": 0.2760004586629974, + "grad_norm": 0.284013308754895, + "learning_rate": 0.0008495591442042463, + "loss": 2.6087, + "step": 2407 + }, + { + "epoch": 0.27611512441233804, + "grad_norm": 0.25285473429827365, + "learning_rate": 0.0008494263468593307, + "loss": 2.489, + "step": 2408 + }, + { + "epoch": 0.2762297901616787, + "grad_norm": 0.27345790478784665, + "learning_rate": 0.000849293501317711, + "loss": 2.4463, + "step": 2409 + }, + { + "epoch": 0.27634445591101936, + "grad_norm": 0.2798468559473655, + "learning_rate": 0.000849160607597711, + "loss": 2.5281, + "step": 2410 + }, + { + "epoch": 0.2764591216603601, + "grad_norm": 0.2675057092775682, + "learning_rate": 0.000849027665717661, + "loss": 2.5024, + "step": 2411 + }, + { + "epoch": 0.27657378740970073, + "grad_norm": 0.2803925449427268, + "learning_rate": 0.0008488946756958973, + "loss": 2.5563, + "step": 2412 + }, + { + "epoch": 0.2766884531590414, + "grad_norm": 0.28868003010772186, + "learning_rate": 0.0008487616375507639, + "loss": 2.384, + "step": 2413 + }, + { + "epoch": 0.27680311890838205, + "grad_norm": 0.30729004164670604, + "learning_rate": 0.0008486285513006104, + "loss": 2.5996, + "step": 2414 + }, + { + "epoch": 0.27691778465772277, + "grad_norm": 0.29676006219158807, + "learning_rate": 0.0008484954169637937, + "loss": 2.601, + "step": 2415 + }, + { + "epoch": 0.2770324504070634, + "grad_norm": 0.27217287440114984, + "learning_rate": 0.0008483622345586774, + "loss": 2.5239, + "step": 2416 + }, + { + "epoch": 0.2771471161564041, + "grad_norm": 0.27321044490303814, + "learning_rate": 0.0008482290041036309, + "loss": 2.643, + "step": 2417 + }, + { + "epoch": 0.27726178190574474, + "grad_norm": 0.25424309019211944, + "learning_rate": 0.0008480957256170314, + "loss": 2.5906, + "step": 2418 + }, + { + "epoch": 0.2773764476550854, + "grad_norm": 0.25470424132020775, + "learning_rate": 0.0008479623991172618, + "loss": 2.526, + "step": 2419 + }, + { + "epoch": 0.2774911134044261, + "grad_norm": 0.27683146470548986, + "learning_rate": 0.000847829024622712, + "loss": 2.6117, + "step": 2420 + }, + { + "epoch": 0.2776057791537668, + "grad_norm": 0.2589440078264431, + "learning_rate": 0.0008476956021517783, + "loss": 2.6008, + "step": 2421 + }, + { + "epoch": 0.27772044490310743, + "grad_norm": 0.27949741614070356, + "learning_rate": 0.0008475621317228641, + "loss": 2.4709, + "step": 2422 + }, + { + "epoch": 0.2778351106524481, + "grad_norm": 0.2693507174978494, + "learning_rate": 0.000847428613354379, + "loss": 2.5444, + "step": 2423 + }, + { + "epoch": 0.2779497764017888, + "grad_norm": 0.2556183818146168, + "learning_rate": 0.0008472950470647393, + "loss": 2.4278, + "step": 2424 + }, + { + "epoch": 0.27806444215112947, + "grad_norm": 0.24387398362986776, + "learning_rate": 0.0008471614328723678, + "loss": 2.544, + "step": 2425 + }, + { + "epoch": 0.2781791079004701, + "grad_norm": 0.2674125616777062, + "learning_rate": 0.0008470277707956943, + "loss": 2.5712, + "step": 2426 + }, + { + "epoch": 0.2782937736498108, + "grad_norm": 0.26005974293490713, + "learning_rate": 0.0008468940608531546, + "loss": 2.5905, + "step": 2427 + }, + { + "epoch": 0.2784084393991515, + "grad_norm": 0.27520008793186473, + "learning_rate": 0.0008467603030631916, + "loss": 2.4995, + "step": 2428 + }, + { + "epoch": 0.27852310514849216, + "grad_norm": 0.2740718219058786, + "learning_rate": 0.0008466264974442548, + "loss": 2.6654, + "step": 2429 + }, + { + "epoch": 0.2786377708978328, + "grad_norm": 0.2786869731536121, + "learning_rate": 0.0008464926440147998, + "loss": 2.6526, + "step": 2430 + }, + { + "epoch": 0.2787524366471735, + "grad_norm": 0.2865174008230928, + "learning_rate": 0.0008463587427932895, + "loss": 2.5338, + "step": 2431 + }, + { + "epoch": 0.2788671023965142, + "grad_norm": 0.3013424398143583, + "learning_rate": 0.0008462247937981928, + "loss": 2.6616, + "step": 2432 + }, + { + "epoch": 0.27898176814585485, + "grad_norm": 0.27499385160668566, + "learning_rate": 0.0008460907970479853, + "loss": 2.5329, + "step": 2433 + }, + { + "epoch": 0.2790964338951955, + "grad_norm": 0.25946737896770045, + "learning_rate": 0.0008459567525611496, + "loss": 2.59, + "step": 2434 + }, + { + "epoch": 0.27921109964453616, + "grad_norm": 0.24892950398149521, + "learning_rate": 0.0008458226603561742, + "loss": 2.5647, + "step": 2435 + }, + { + "epoch": 0.2793257653938768, + "grad_norm": 0.2830585037696782, + "learning_rate": 0.0008456885204515549, + "loss": 2.5004, + "step": 2436 + }, + { + "epoch": 0.27944043114321754, + "grad_norm": 0.2696320895904184, + "learning_rate": 0.0008455543328657937, + "loss": 2.7736, + "step": 2437 + }, + { + "epoch": 0.2795550968925582, + "grad_norm": 0.24828188271491208, + "learning_rate": 0.0008454200976173991, + "loss": 2.6013, + "step": 2438 + }, + { + "epoch": 0.27966976264189886, + "grad_norm": 0.26321309516174424, + "learning_rate": 0.0008452858147248863, + "loss": 2.6673, + "step": 2439 + }, + { + "epoch": 0.2797844283912395, + "grad_norm": 0.23956378267391462, + "learning_rate": 0.0008451514842067771, + "loss": 2.6743, + "step": 2440 + }, + { + "epoch": 0.27989909414058023, + "grad_norm": 0.27328229047816405, + "learning_rate": 0.0008450171060815999, + "loss": 2.5108, + "step": 2441 + }, + { + "epoch": 0.2800137598899209, + "grad_norm": 0.2776195209007553, + "learning_rate": 0.0008448826803678896, + "loss": 2.6933, + "step": 2442 + }, + { + "epoch": 0.28012842563926155, + "grad_norm": 0.26376373959280863, + "learning_rate": 0.0008447482070841875, + "loss": 2.5415, + "step": 2443 + }, + { + "epoch": 0.2802430913886022, + "grad_norm": 0.2623112008839188, + "learning_rate": 0.0008446136862490417, + "loss": 2.644, + "step": 2444 + }, + { + "epoch": 0.2803577571379429, + "grad_norm": 0.2511333606686069, + "learning_rate": 0.0008444791178810068, + "loss": 2.6078, + "step": 2445 + }, + { + "epoch": 0.2804724228872836, + "grad_norm": 0.2637196874034492, + "learning_rate": 0.0008443445019986441, + "loss": 2.6085, + "step": 2446 + }, + { + "epoch": 0.28058708863662424, + "grad_norm": 0.25387644452304403, + "learning_rate": 0.0008442098386205211, + "loss": 2.5923, + "step": 2447 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.2637723568440843, + "learning_rate": 0.0008440751277652122, + "loss": 2.5806, + "step": 2448 + }, + { + "epoch": 0.2808164201353056, + "grad_norm": 0.2490812290960805, + "learning_rate": 0.0008439403694512978, + "loss": 2.696, + "step": 2449 + }, + { + "epoch": 0.28093108588464627, + "grad_norm": 0.24113497441315793, + "learning_rate": 0.0008438055636973657, + "loss": 2.659, + "step": 2450 + }, + { + "epoch": 0.28104575163398693, + "grad_norm": 0.2521301586845647, + "learning_rate": 0.0008436707105220096, + "loss": 2.4955, + "step": 2451 + }, + { + "epoch": 0.2811604173833276, + "grad_norm": 0.27395494839982504, + "learning_rate": 0.00084353580994383, + "loss": 2.6096, + "step": 2452 + }, + { + "epoch": 0.28127508313266825, + "grad_norm": 0.2756061744104837, + "learning_rate": 0.0008434008619814337, + "loss": 2.6378, + "step": 2453 + }, + { + "epoch": 0.28138974888200896, + "grad_norm": 0.27729473882185857, + "learning_rate": 0.0008432658666534345, + "loss": 2.6098, + "step": 2454 + }, + { + "epoch": 0.2815044146313496, + "grad_norm": 0.2626666974994579, + "learning_rate": 0.0008431308239784521, + "loss": 2.5488, + "step": 2455 + }, + { + "epoch": 0.2816190803806903, + "grad_norm": 0.2538847254037964, + "learning_rate": 0.0008429957339751132, + "loss": 2.5729, + "step": 2456 + }, + { + "epoch": 0.28173374613003094, + "grad_norm": 0.2843633830913365, + "learning_rate": 0.0008428605966620508, + "loss": 2.5347, + "step": 2457 + }, + { + "epoch": 0.28184841187937165, + "grad_norm": 0.2644753301904815, + "learning_rate": 0.0008427254120579047, + "loss": 2.4835, + "step": 2458 + }, + { + "epoch": 0.2819630776287123, + "grad_norm": 0.2729346014066077, + "learning_rate": 0.0008425901801813212, + "loss": 2.5508, + "step": 2459 + }, + { + "epoch": 0.28207774337805297, + "grad_norm": 0.28233038553180995, + "learning_rate": 0.0008424549010509524, + "loss": 2.5295, + "step": 2460 + }, + { + "epoch": 0.28219240912739363, + "grad_norm": 0.2877390788139464, + "learning_rate": 0.0008423195746854578, + "loss": 2.682, + "step": 2461 + }, + { + "epoch": 0.28230707487673434, + "grad_norm": 0.2766417386296377, + "learning_rate": 0.000842184201103503, + "loss": 2.5467, + "step": 2462 + }, + { + "epoch": 0.282421740626075, + "grad_norm": 0.26270755010262625, + "learning_rate": 0.0008420487803237604, + "loss": 2.5227, + "step": 2463 + }, + { + "epoch": 0.28253640637541566, + "grad_norm": 0.262026148325715, + "learning_rate": 0.0008419133123649088, + "loss": 2.7123, + "step": 2464 + }, + { + "epoch": 0.2826510721247563, + "grad_norm": 0.2601549999321762, + "learning_rate": 0.0008417777972456328, + "loss": 2.5418, + "step": 2465 + }, + { + "epoch": 0.28276573787409703, + "grad_norm": 0.27153287650376556, + "learning_rate": 0.0008416422349846249, + "loss": 2.4264, + "step": 2466 + }, + { + "epoch": 0.2828804036234377, + "grad_norm": 0.265032650510231, + "learning_rate": 0.0008415066256005827, + "loss": 2.4691, + "step": 2467 + }, + { + "epoch": 0.28299506937277835, + "grad_norm": 0.2567405159664836, + "learning_rate": 0.0008413709691122115, + "loss": 2.4004, + "step": 2468 + }, + { + "epoch": 0.283109735122119, + "grad_norm": 0.27719165538156515, + "learning_rate": 0.0008412352655382221, + "loss": 2.6787, + "step": 2469 + }, + { + "epoch": 0.28322440087145967, + "grad_norm": 0.2812957518489372, + "learning_rate": 0.0008410995148973323, + "loss": 2.6341, + "step": 2470 + }, + { + "epoch": 0.2833390666208004, + "grad_norm": 0.281271010793345, + "learning_rate": 0.0008409637172082664, + "loss": 2.4621, + "step": 2471 + }, + { + "epoch": 0.28345373237014104, + "grad_norm": 0.2587891989115105, + "learning_rate": 0.0008408278724897551, + "loss": 2.5004, + "step": 2472 + }, + { + "epoch": 0.2835683981194817, + "grad_norm": 0.271106795596705, + "learning_rate": 0.0008406919807605356, + "loss": 2.6544, + "step": 2473 + }, + { + "epoch": 0.28368306386882236, + "grad_norm": 0.2552763136343683, + "learning_rate": 0.0008405560420393515, + "loss": 2.5887, + "step": 2474 + }, + { + "epoch": 0.2837977296181631, + "grad_norm": 0.2670747247445295, + "learning_rate": 0.000840420056344953, + "loss": 2.5077, + "step": 2475 + }, + { + "epoch": 0.28391239536750373, + "grad_norm": 0.26887812616498585, + "learning_rate": 0.0008402840236960967, + "loss": 2.44, + "step": 2476 + }, + { + "epoch": 0.2840270611168444, + "grad_norm": 0.23851366647159697, + "learning_rate": 0.0008401479441115456, + "loss": 2.6145, + "step": 2477 + }, + { + "epoch": 0.28414172686618505, + "grad_norm": 0.2588460697419801, + "learning_rate": 0.0008400118176100697, + "loss": 2.4538, + "step": 2478 + }, + { + "epoch": 0.28425639261552577, + "grad_norm": 0.2537866070969727, + "learning_rate": 0.0008398756442104446, + "loss": 2.5388, + "step": 2479 + }, + { + "epoch": 0.2843710583648664, + "grad_norm": 0.25674138356251236, + "learning_rate": 0.0008397394239314529, + "loss": 2.5838, + "step": 2480 + }, + { + "epoch": 0.2844857241142071, + "grad_norm": 0.22532047471017413, + "learning_rate": 0.0008396031567918839, + "loss": 2.5809, + "step": 2481 + }, + { + "epoch": 0.28460038986354774, + "grad_norm": 0.2284490132812174, + "learning_rate": 0.0008394668428105327, + "loss": 2.591, + "step": 2482 + }, + { + "epoch": 0.28471505561288846, + "grad_norm": 0.240063483113093, + "learning_rate": 0.0008393304820062016, + "loss": 2.5132, + "step": 2483 + }, + { + "epoch": 0.2848297213622291, + "grad_norm": 0.2509812631721475, + "learning_rate": 0.0008391940743976984, + "loss": 2.6037, + "step": 2484 + }, + { + "epoch": 0.2849443871115698, + "grad_norm": 0.28944807384776783, + "learning_rate": 0.0008390576200038385, + "loss": 2.6257, + "step": 2485 + }, + { + "epoch": 0.28505905286091043, + "grad_norm": 0.24861666331172017, + "learning_rate": 0.0008389211188434429, + "loss": 2.6277, + "step": 2486 + }, + { + "epoch": 0.2851737186102511, + "grad_norm": 0.2732205773122366, + "learning_rate": 0.0008387845709353392, + "loss": 2.6306, + "step": 2487 + }, + { + "epoch": 0.2852883843595918, + "grad_norm": 0.2896018687748785, + "learning_rate": 0.000838647976298362, + "loss": 2.5352, + "step": 2488 + }, + { + "epoch": 0.28540305010893247, + "grad_norm": 0.2745605087443794, + "learning_rate": 0.0008385113349513516, + "loss": 2.4832, + "step": 2489 + }, + { + "epoch": 0.2855177158582731, + "grad_norm": 0.27181938832496855, + "learning_rate": 0.0008383746469131551, + "loss": 2.4318, + "step": 2490 + }, + { + "epoch": 0.2856323816076138, + "grad_norm": 0.2923342484725118, + "learning_rate": 0.0008382379122026263, + "loss": 2.5815, + "step": 2491 + }, + { + "epoch": 0.2857470473569545, + "grad_norm": 0.26771335676206004, + "learning_rate": 0.0008381011308386246, + "loss": 2.6396, + "step": 2492 + }, + { + "epoch": 0.28586171310629516, + "grad_norm": 0.24439132331250865, + "learning_rate": 0.0008379643028400168, + "loss": 2.5168, + "step": 2493 + }, + { + "epoch": 0.2859763788556358, + "grad_norm": 0.26464200707960456, + "learning_rate": 0.0008378274282256757, + "loss": 2.5411, + "step": 2494 + }, + { + "epoch": 0.2860910446049765, + "grad_norm": 0.25397448550067075, + "learning_rate": 0.0008376905070144804, + "loss": 2.6693, + "step": 2495 + }, + { + "epoch": 0.2862057103543172, + "grad_norm": 0.25259471750924994, + "learning_rate": 0.0008375535392253166, + "loss": 2.5454, + "step": 2496 + }, + { + "epoch": 0.28632037610365785, + "grad_norm": 0.24574643562235957, + "learning_rate": 0.0008374165248770764, + "loss": 2.5513, + "step": 2497 + }, + { + "epoch": 0.2864350418529985, + "grad_norm": 0.24133489002653277, + "learning_rate": 0.0008372794639886583, + "loss": 2.5478, + "step": 2498 + }, + { + "epoch": 0.28654970760233917, + "grad_norm": 0.23693809749911574, + "learning_rate": 0.0008371423565789674, + "loss": 2.6303, + "step": 2499 + }, + { + "epoch": 0.2866643733516799, + "grad_norm": 0.2382040425841025, + "learning_rate": 0.0008370052026669149, + "loss": 2.4535, + "step": 2500 + }, + { + "epoch": 0.28677903910102054, + "grad_norm": 0.26455669545078525, + "learning_rate": 0.0008368680022714186, + "loss": 2.5823, + "step": 2501 + }, + { + "epoch": 0.2868937048503612, + "grad_norm": 0.24753695532485726, + "learning_rate": 0.0008367307554114025, + "loss": 2.5698, + "step": 2502 + }, + { + "epoch": 0.28700837059970186, + "grad_norm": 0.24750414229584652, + "learning_rate": 0.0008365934621057976, + "loss": 2.6356, + "step": 2503 + }, + { + "epoch": 0.2871230363490425, + "grad_norm": 0.28685584546240395, + "learning_rate": 0.0008364561223735405, + "loss": 2.6334, + "step": 2504 + }, + { + "epoch": 0.28723770209838323, + "grad_norm": 0.23903842125290242, + "learning_rate": 0.0008363187362335749, + "loss": 2.4476, + "step": 2505 + }, + { + "epoch": 0.2873523678477239, + "grad_norm": 0.2630605831270681, + "learning_rate": 0.0008361813037048503, + "loss": 2.5156, + "step": 2506 + }, + { + "epoch": 0.28746703359706455, + "grad_norm": 0.24244475738613178, + "learning_rate": 0.0008360438248063231, + "loss": 2.6068, + "step": 2507 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.2528370204111683, + "learning_rate": 0.0008359062995569559, + "loss": 2.5871, + "step": 2508 + }, + { + "epoch": 0.2876963650957459, + "grad_norm": 0.2616128700634317, + "learning_rate": 0.0008357687279757177, + "loss": 2.5416, + "step": 2509 + }, + { + "epoch": 0.2878110308450866, + "grad_norm": 0.23412276764895418, + "learning_rate": 0.0008356311100815837, + "loss": 2.49, + "step": 2510 + }, + { + "epoch": 0.28792569659442724, + "grad_norm": 0.2677953432188376, + "learning_rate": 0.0008354934458935357, + "loss": 2.5526, + "step": 2511 + }, + { + "epoch": 0.2880403623437679, + "grad_norm": 0.2855967137862939, + "learning_rate": 0.0008353557354305621, + "loss": 2.6344, + "step": 2512 + }, + { + "epoch": 0.2881550280931086, + "grad_norm": 0.29922786567165965, + "learning_rate": 0.0008352179787116572, + "loss": 2.6045, + "step": 2513 + }, + { + "epoch": 0.28826969384244927, + "grad_norm": 0.28152451197141004, + "learning_rate": 0.000835080175755822, + "loss": 2.6077, + "step": 2514 + }, + { + "epoch": 0.28838435959178993, + "grad_norm": 0.2801391401577633, + "learning_rate": 0.0008349423265820636, + "loss": 2.5958, + "step": 2515 + }, + { + "epoch": 0.2884990253411306, + "grad_norm": 0.24648791106938245, + "learning_rate": 0.0008348044312093959, + "loss": 2.6154, + "step": 2516 + }, + { + "epoch": 0.2886136910904713, + "grad_norm": 0.27225314902474773, + "learning_rate": 0.0008346664896568389, + "loss": 2.606, + "step": 2517 + }, + { + "epoch": 0.28872835683981196, + "grad_norm": 0.2873948014265642, + "learning_rate": 0.000834528501943419, + "loss": 2.7184, + "step": 2518 + }, + { + "epoch": 0.2888430225891526, + "grad_norm": 0.26780405853681727, + "learning_rate": 0.000834390468088169, + "loss": 2.5804, + "step": 2519 + }, + { + "epoch": 0.2889576883384933, + "grad_norm": 0.24182334785712256, + "learning_rate": 0.0008342523881101279, + "loss": 2.6582, + "step": 2520 + }, + { + "epoch": 0.28907235408783394, + "grad_norm": 0.27523201646743733, + "learning_rate": 0.0008341142620283412, + "loss": 2.6939, + "step": 2521 + }, + { + "epoch": 0.28918701983717465, + "grad_norm": 0.24884301518765278, + "learning_rate": 0.0008339760898618611, + "loss": 2.5228, + "step": 2522 + }, + { + "epoch": 0.2893016855865153, + "grad_norm": 0.24056876252172377, + "learning_rate": 0.0008338378716297454, + "loss": 2.6042, + "step": 2523 + }, + { + "epoch": 0.28941635133585597, + "grad_norm": 0.25163468894240254, + "learning_rate": 0.0008336996073510589, + "loss": 2.5245, + "step": 2524 + }, + { + "epoch": 0.28953101708519663, + "grad_norm": 0.2598603076723986, + "learning_rate": 0.0008335612970448723, + "loss": 2.5671, + "step": 2525 + }, + { + "epoch": 0.28964568283453734, + "grad_norm": 0.26415173104784145, + "learning_rate": 0.0008334229407302632, + "loss": 2.5604, + "step": 2526 + }, + { + "epoch": 0.289760348583878, + "grad_norm": 0.2675244020569685, + "learning_rate": 0.000833284538426315, + "loss": 2.5647, + "step": 2527 + }, + { + "epoch": 0.28987501433321866, + "grad_norm": 0.28987122631968115, + "learning_rate": 0.0008331460901521178, + "loss": 2.561, + "step": 2528 + }, + { + "epoch": 0.2899896800825593, + "grad_norm": 0.2887040856180903, + "learning_rate": 0.0008330075959267677, + "loss": 2.6132, + "step": 2529 + }, + { + "epoch": 0.29010434583190003, + "grad_norm": 0.2873896980901336, + "learning_rate": 0.0008328690557693674, + "loss": 2.5155, + "step": 2530 + }, + { + "epoch": 0.2902190115812407, + "grad_norm": 0.258179408833659, + "learning_rate": 0.000832730469699026, + "loss": 2.4676, + "step": 2531 + }, + { + "epoch": 0.29033367733058135, + "grad_norm": 0.2714605469391852, + "learning_rate": 0.0008325918377348587, + "loss": 2.5601, + "step": 2532 + }, + { + "epoch": 0.290448343079922, + "grad_norm": 0.2435376709444652, + "learning_rate": 0.0008324531598959871, + "loss": 2.5328, + "step": 2533 + }, + { + "epoch": 0.2905630088292627, + "grad_norm": 0.27102761470688463, + "learning_rate": 0.0008323144362015393, + "loss": 2.6384, + "step": 2534 + }, + { + "epoch": 0.2906776745786034, + "grad_norm": 0.2669748822496197, + "learning_rate": 0.0008321756666706495, + "loss": 2.6509, + "step": 2535 + }, + { + "epoch": 0.29079234032794404, + "grad_norm": 0.27142868317612345, + "learning_rate": 0.0008320368513224584, + "loss": 2.5714, + "step": 2536 + }, + { + "epoch": 0.2909070060772847, + "grad_norm": 0.29845158420613227, + "learning_rate": 0.0008318979901761128, + "loss": 2.6769, + "step": 2537 + }, + { + "epoch": 0.29102167182662536, + "grad_norm": 0.23053483844719486, + "learning_rate": 0.000831759083250766, + "loss": 2.5696, + "step": 2538 + }, + { + "epoch": 0.2911363375759661, + "grad_norm": 0.2582694574957293, + "learning_rate": 0.0008316201305655775, + "loss": 2.5023, + "step": 2539 + }, + { + "epoch": 0.29125100332530673, + "grad_norm": 0.2686303123294477, + "learning_rate": 0.0008314811321397134, + "loss": 2.5818, + "step": 2540 + }, + { + "epoch": 0.2913656690746474, + "grad_norm": 0.2553653961158758, + "learning_rate": 0.0008313420879923456, + "loss": 2.5225, + "step": 2541 + }, + { + "epoch": 0.29148033482398805, + "grad_norm": 0.26972664974837973, + "learning_rate": 0.0008312029981426528, + "loss": 2.5773, + "step": 2542 + }, + { + "epoch": 0.29159500057332877, + "grad_norm": 0.26307320502763415, + "learning_rate": 0.0008310638626098196, + "loss": 2.5796, + "step": 2543 + }, + { + "epoch": 0.2917096663226694, + "grad_norm": 0.2698760819557667, + "learning_rate": 0.0008309246814130372, + "loss": 2.5995, + "step": 2544 + }, + { + "epoch": 0.2918243320720101, + "grad_norm": 0.2574911402894862, + "learning_rate": 0.0008307854545715032, + "loss": 2.6013, + "step": 2545 + }, + { + "epoch": 0.29193899782135074, + "grad_norm": 0.25379289111904746, + "learning_rate": 0.0008306461821044209, + "loss": 2.5271, + "step": 2546 + }, + { + "epoch": 0.29205366357069146, + "grad_norm": 0.2297227331859884, + "learning_rate": 0.0008305068640310006, + "loss": 2.748, + "step": 2547 + }, + { + "epoch": 0.2921683293200321, + "grad_norm": 0.24517836060190237, + "learning_rate": 0.0008303675003704583, + "loss": 2.4866, + "step": 2548 + }, + { + "epoch": 0.2922829950693728, + "grad_norm": 0.26780788251948234, + "learning_rate": 0.0008302280911420167, + "loss": 2.5668, + "step": 2549 + }, + { + "epoch": 0.29239766081871343, + "grad_norm": 0.24178008615090107, + "learning_rate": 0.0008300886363649048, + "loss": 2.5103, + "step": 2550 + }, + { + "epoch": 0.29251232656805415, + "grad_norm": 0.2671598309721243, + "learning_rate": 0.0008299491360583574, + "loss": 2.3844, + "step": 2551 + }, + { + "epoch": 0.2926269923173948, + "grad_norm": 0.23935761227462785, + "learning_rate": 0.000829809590241616, + "loss": 2.5809, + "step": 2552 + }, + { + "epoch": 0.29274165806673547, + "grad_norm": 0.27754340309061803, + "learning_rate": 0.0008296699989339287, + "loss": 2.6769, + "step": 2553 + }, + { + "epoch": 0.2928563238160761, + "grad_norm": 0.24351411458797384, + "learning_rate": 0.000829530362154549, + "loss": 2.4271, + "step": 2554 + }, + { + "epoch": 0.2929709895654168, + "grad_norm": 0.2676976197480251, + "learning_rate": 0.0008293906799227371, + "loss": 2.4924, + "step": 2555 + }, + { + "epoch": 0.2930856553147575, + "grad_norm": 0.2540753797965595, + "learning_rate": 0.0008292509522577599, + "loss": 2.5094, + "step": 2556 + }, + { + "epoch": 0.29320032106409816, + "grad_norm": 0.24750244288415035, + "learning_rate": 0.0008291111791788897, + "loss": 2.5268, + "step": 2557 + }, + { + "epoch": 0.2933149868134388, + "grad_norm": 0.2642507582435455, + "learning_rate": 0.0008289713607054059, + "loss": 2.6035, + "step": 2558 + }, + { + "epoch": 0.2934296525627795, + "grad_norm": 0.26800905287740867, + "learning_rate": 0.0008288314968565938, + "loss": 2.5571, + "step": 2559 + }, + { + "epoch": 0.2935443183121202, + "grad_norm": 0.27667328908230365, + "learning_rate": 0.0008286915876517444, + "loss": 2.5551, + "step": 2560 + }, + { + "epoch": 0.29365898406146085, + "grad_norm": 0.2507592239871691, + "learning_rate": 0.0008285516331101563, + "loss": 2.5304, + "step": 2561 + }, + { + "epoch": 0.2937736498108015, + "grad_norm": 0.26084774321358145, + "learning_rate": 0.0008284116332511329, + "loss": 2.6026, + "step": 2562 + }, + { + "epoch": 0.29388831556014217, + "grad_norm": 0.2801695105940615, + "learning_rate": 0.0008282715880939851, + "loss": 2.4756, + "step": 2563 + }, + { + "epoch": 0.2940029813094829, + "grad_norm": 0.2459966038896636, + "learning_rate": 0.0008281314976580289, + "loss": 2.5764, + "step": 2564 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.278512867919984, + "learning_rate": 0.0008279913619625874, + "loss": 2.578, + "step": 2565 + }, + { + "epoch": 0.2942323128081642, + "grad_norm": 0.27075830268459006, + "learning_rate": 0.0008278511810269896, + "loss": 2.4781, + "step": 2566 + }, + { + "epoch": 0.29434697855750486, + "grad_norm": 0.24267260078221942, + "learning_rate": 0.0008277109548705708, + "loss": 2.4221, + "step": 2567 + }, + { + "epoch": 0.29446164430684557, + "grad_norm": 0.2764497208632351, + "learning_rate": 0.0008275706835126726, + "loss": 2.5206, + "step": 2568 + }, + { + "epoch": 0.29457631005618623, + "grad_norm": 0.2521141230184501, + "learning_rate": 0.0008274303669726426, + "loss": 2.477, + "step": 2569 + }, + { + "epoch": 0.2946909758055269, + "grad_norm": 0.2615121583425356, + "learning_rate": 0.0008272900052698349, + "loss": 2.5262, + "step": 2570 + }, + { + "epoch": 0.29480564155486755, + "grad_norm": 0.2838419722726068, + "learning_rate": 0.0008271495984236096, + "loss": 2.5906, + "step": 2571 + }, + { + "epoch": 0.2949203073042082, + "grad_norm": 0.29428895214928474, + "learning_rate": 0.0008270091464533333, + "loss": 2.5638, + "step": 2572 + }, + { + "epoch": 0.2950349730535489, + "grad_norm": 0.2516902648675629, + "learning_rate": 0.0008268686493783786, + "loss": 2.5653, + "step": 2573 + }, + { + "epoch": 0.2951496388028896, + "grad_norm": 0.2686053861533405, + "learning_rate": 0.0008267281072181245, + "loss": 2.3053, + "step": 2574 + }, + { + "epoch": 0.29526430455223024, + "grad_norm": 0.24311864073122846, + "learning_rate": 0.0008265875199919558, + "loss": 2.4381, + "step": 2575 + }, + { + "epoch": 0.2953789703015709, + "grad_norm": 0.25337583157589805, + "learning_rate": 0.0008264468877192641, + "loss": 2.4448, + "step": 2576 + }, + { + "epoch": 0.2954936360509116, + "grad_norm": 0.23407568226072326, + "learning_rate": 0.000826306210419447, + "loss": 2.5709, + "step": 2577 + }, + { + "epoch": 0.29560830180025227, + "grad_norm": 0.25538288865585457, + "learning_rate": 0.0008261654881119081, + "loss": 2.6779, + "step": 2578 + }, + { + "epoch": 0.29572296754959293, + "grad_norm": 0.2574883768889266, + "learning_rate": 0.0008260247208160574, + "loss": 2.4956, + "step": 2579 + }, + { + "epoch": 0.2958376332989336, + "grad_norm": 0.23959193148638167, + "learning_rate": 0.000825883908551311, + "loss": 2.52, + "step": 2580 + }, + { + "epoch": 0.2959522990482743, + "grad_norm": 0.26020538219078226, + "learning_rate": 0.0008257430513370914, + "loss": 2.542, + "step": 2581 + }, + { + "epoch": 0.29606696479761496, + "grad_norm": 0.2514207975682644, + "learning_rate": 0.000825602149192827, + "loss": 2.4919, + "step": 2582 + }, + { + "epoch": 0.2961816305469556, + "grad_norm": 0.23948799311614954, + "learning_rate": 0.0008254612021379526, + "loss": 2.6958, + "step": 2583 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.2616184053597987, + "learning_rate": 0.0008253202101919095, + "loss": 2.6454, + "step": 2584 + }, + { + "epoch": 0.296410962045637, + "grad_norm": 0.272890953355618, + "learning_rate": 0.0008251791733741442, + "loss": 2.6542, + "step": 2585 + }, + { + "epoch": 0.29652562779497765, + "grad_norm": 0.264928550882729, + "learning_rate": 0.0008250380917041107, + "loss": 2.5926, + "step": 2586 + }, + { + "epoch": 0.2966402935443183, + "grad_norm": 0.249725572599277, + "learning_rate": 0.0008248969652012681, + "loss": 2.4806, + "step": 2587 + }, + { + "epoch": 0.29675495929365897, + "grad_norm": 0.26393468016124505, + "learning_rate": 0.0008247557938850824, + "loss": 2.5642, + "step": 2588 + }, + { + "epoch": 0.29686962504299963, + "grad_norm": 0.2740089980255711, + "learning_rate": 0.0008246145777750253, + "loss": 2.3383, + "step": 2589 + }, + { + "epoch": 0.29698429079234034, + "grad_norm": 0.2750800957963272, + "learning_rate": 0.0008244733168905748, + "loss": 2.5697, + "step": 2590 + }, + { + "epoch": 0.297098956541681, + "grad_norm": 0.2741802728961887, + "learning_rate": 0.0008243320112512153, + "loss": 2.6756, + "step": 2591 + }, + { + "epoch": 0.29721362229102166, + "grad_norm": 0.2766665595057287, + "learning_rate": 0.0008241906608764373, + "loss": 2.5277, + "step": 2592 + }, + { + "epoch": 0.2973282880403623, + "grad_norm": 0.2677089262380423, + "learning_rate": 0.000824049265785737, + "loss": 2.5989, + "step": 2593 + }, + { + "epoch": 0.29744295378970304, + "grad_norm": 0.2618647860442471, + "learning_rate": 0.0008239078259986177, + "loss": 2.4987, + "step": 2594 + }, + { + "epoch": 0.2975576195390437, + "grad_norm": 0.24787801235604665, + "learning_rate": 0.0008237663415345879, + "loss": 2.6281, + "step": 2595 + }, + { + "epoch": 0.29767228528838435, + "grad_norm": 0.2462064791589125, + "learning_rate": 0.0008236248124131629, + "loss": 2.5271, + "step": 2596 + }, + { + "epoch": 0.297786951037725, + "grad_norm": 0.25919899633384147, + "learning_rate": 0.0008234832386538639, + "loss": 2.5578, + "step": 2597 + }, + { + "epoch": 0.2979016167870657, + "grad_norm": 0.2410867345853169, + "learning_rate": 0.0008233416202762182, + "loss": 2.5262, + "step": 2598 + }, + { + "epoch": 0.2980162825364064, + "grad_norm": 0.2500468643598101, + "learning_rate": 0.0008231999572997595, + "loss": 2.5121, + "step": 2599 + }, + { + "epoch": 0.29813094828574704, + "grad_norm": 0.25563868695159186, + "learning_rate": 0.0008230582497440273, + "loss": 2.4701, + "step": 2600 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 0.25592953141750713, + "learning_rate": 0.0008229164976285678, + "loss": 2.5537, + "step": 2601 + }, + { + "epoch": 0.2983602797844284, + "grad_norm": 0.2664228790988395, + "learning_rate": 0.0008227747009729327, + "loss": 2.4221, + "step": 2602 + }, + { + "epoch": 0.2984749455337691, + "grad_norm": 0.26691023665909697, + "learning_rate": 0.0008226328597966803, + "loss": 2.721, + "step": 2603 + }, + { + "epoch": 0.29858961128310973, + "grad_norm": 0.27734809539936495, + "learning_rate": 0.0008224909741193747, + "loss": 2.6227, + "step": 2604 + }, + { + "epoch": 0.2987042770324504, + "grad_norm": 0.27451531824787123, + "learning_rate": 0.0008223490439605865, + "loss": 2.5723, + "step": 2605 + }, + { + "epoch": 0.29881894278179105, + "grad_norm": 0.27516780950669967, + "learning_rate": 0.0008222070693398924, + "loss": 2.4942, + "step": 2606 + }, + { + "epoch": 0.29893360853113177, + "grad_norm": 0.27411150746865875, + "learning_rate": 0.0008220650502768748, + "loss": 2.5202, + "step": 2607 + }, + { + "epoch": 0.2990482742804724, + "grad_norm": 0.27486556650745697, + "learning_rate": 0.0008219229867911224, + "loss": 2.5505, + "step": 2608 + }, + { + "epoch": 0.2991629400298131, + "grad_norm": 0.2843172835844898, + "learning_rate": 0.0008217808789022308, + "loss": 2.4929, + "step": 2609 + }, + { + "epoch": 0.29927760577915374, + "grad_norm": 0.2549478895169563, + "learning_rate": 0.0008216387266298004, + "loss": 2.5998, + "step": 2610 + }, + { + "epoch": 0.29939227152849446, + "grad_norm": 0.2539881219322232, + "learning_rate": 0.0008214965299934386, + "loss": 2.3953, + "step": 2611 + }, + { + "epoch": 0.2995069372778351, + "grad_norm": 0.27181899012056543, + "learning_rate": 0.0008213542890127589, + "loss": 2.6682, + "step": 2612 + }, + { + "epoch": 0.2996216030271758, + "grad_norm": 0.27438492832813927, + "learning_rate": 0.0008212120037073805, + "loss": 2.5172, + "step": 2613 + }, + { + "epoch": 0.29973626877651643, + "grad_norm": 0.294729568093543, + "learning_rate": 0.0008210696740969292, + "loss": 2.5638, + "step": 2614 + }, + { + "epoch": 0.29985093452585715, + "grad_norm": 0.25101844542013674, + "learning_rate": 0.0008209273002010364, + "loss": 2.5346, + "step": 2615 + }, + { + "epoch": 0.2999656002751978, + "grad_norm": 0.2651371400762005, + "learning_rate": 0.00082078488203934, + "loss": 2.4691, + "step": 2616 + }, + { + "epoch": 0.30008026602453847, + "grad_norm": 0.23102422636217235, + "learning_rate": 0.0008206424196314838, + "loss": 2.6656, + "step": 2617 + }, + { + "epoch": 0.3001949317738791, + "grad_norm": 0.24545261754765782, + "learning_rate": 0.0008204999129971178, + "loss": 2.5504, + "step": 2618 + }, + { + "epoch": 0.30030959752321984, + "grad_norm": 0.2929914124979611, + "learning_rate": 0.0008203573621558982, + "loss": 2.6014, + "step": 2619 + }, + { + "epoch": 0.3004242632725605, + "grad_norm": 0.26382821691189, + "learning_rate": 0.0008202147671274869, + "loss": 2.4691, + "step": 2620 + }, + { + "epoch": 0.30053892902190116, + "grad_norm": 0.2787423105909502, + "learning_rate": 0.0008200721279315524, + "loss": 2.5393, + "step": 2621 + }, + { + "epoch": 0.3006535947712418, + "grad_norm": 0.27007698698041177, + "learning_rate": 0.000819929444587769, + "loss": 2.6526, + "step": 2622 + }, + { + "epoch": 0.3007682605205825, + "grad_norm": 0.29976303574868957, + "learning_rate": 0.0008197867171158171, + "loss": 2.5902, + "step": 2623 + }, + { + "epoch": 0.3008829262699232, + "grad_norm": 0.26229563612291473, + "learning_rate": 0.0008196439455353833, + "loss": 2.5184, + "step": 2624 + }, + { + "epoch": 0.30099759201926385, + "grad_norm": 0.28907670550652503, + "learning_rate": 0.0008195011298661601, + "loss": 2.4586, + "step": 2625 + }, + { + "epoch": 0.3011122577686045, + "grad_norm": 0.2523295885737605, + "learning_rate": 0.0008193582701278464, + "loss": 2.6349, + "step": 2626 + }, + { + "epoch": 0.30122692351794517, + "grad_norm": 0.265167982685223, + "learning_rate": 0.0008192153663401467, + "loss": 2.5177, + "step": 2627 + }, + { + "epoch": 0.3013415892672859, + "grad_norm": 0.2663613845479394, + "learning_rate": 0.0008190724185227722, + "loss": 2.5488, + "step": 2628 + }, + { + "epoch": 0.30145625501662654, + "grad_norm": 0.24514794125436934, + "learning_rate": 0.0008189294266954395, + "loss": 2.488, + "step": 2629 + }, + { + "epoch": 0.3015709207659672, + "grad_norm": 0.25204367834137453, + "learning_rate": 0.0008187863908778718, + "loss": 2.5412, + "step": 2630 + }, + { + "epoch": 0.30168558651530786, + "grad_norm": 0.27596833928265563, + "learning_rate": 0.0008186433110897982, + "loss": 2.5455, + "step": 2631 + }, + { + "epoch": 0.30180025226464857, + "grad_norm": 0.22970249849334712, + "learning_rate": 0.0008185001873509534, + "loss": 2.5921, + "step": 2632 + }, + { + "epoch": 0.30191491801398923, + "grad_norm": 0.23848289902335468, + "learning_rate": 0.0008183570196810793, + "loss": 2.6264, + "step": 2633 + }, + { + "epoch": 0.3020295837633299, + "grad_norm": 0.28015867889088225, + "learning_rate": 0.0008182138080999226, + "loss": 2.5359, + "step": 2634 + }, + { + "epoch": 0.30214424951267055, + "grad_norm": 0.25331648027154957, + "learning_rate": 0.0008180705526272368, + "loss": 2.4238, + "step": 2635 + }, + { + "epoch": 0.30225891526201126, + "grad_norm": 0.24063925033175645, + "learning_rate": 0.0008179272532827811, + "loss": 2.4331, + "step": 2636 + }, + { + "epoch": 0.3023735810113519, + "grad_norm": 0.24859421665637899, + "learning_rate": 0.0008177839100863212, + "loss": 2.5105, + "step": 2637 + }, + { + "epoch": 0.3024882467606926, + "grad_norm": 0.2487247812278199, + "learning_rate": 0.0008176405230576285, + "loss": 2.4875, + "step": 2638 + }, + { + "epoch": 0.30260291251003324, + "grad_norm": 0.25636835463746843, + "learning_rate": 0.0008174970922164803, + "loss": 2.3896, + "step": 2639 + }, + { + "epoch": 0.3027175782593739, + "grad_norm": 0.25496403348677044, + "learning_rate": 0.0008173536175826603, + "loss": 2.5221, + "step": 2640 + }, + { + "epoch": 0.3028322440087146, + "grad_norm": 0.25622824681501527, + "learning_rate": 0.000817210099175958, + "loss": 2.4956, + "step": 2641 + }, + { + "epoch": 0.30294690975805527, + "grad_norm": 0.26447630328246774, + "learning_rate": 0.0008170665370161691, + "loss": 2.5692, + "step": 2642 + }, + { + "epoch": 0.30306157550739593, + "grad_norm": 0.25224223071648505, + "learning_rate": 0.0008169229311230954, + "loss": 2.5633, + "step": 2643 + }, + { + "epoch": 0.3031762412567366, + "grad_norm": 0.2872893734550882, + "learning_rate": 0.0008167792815165444, + "loss": 2.7366, + "step": 2644 + }, + { + "epoch": 0.3032909070060773, + "grad_norm": 0.2792001134825715, + "learning_rate": 0.0008166355882163296, + "loss": 2.6292, + "step": 2645 + }, + { + "epoch": 0.30340557275541796, + "grad_norm": 0.24986772439521734, + "learning_rate": 0.0008164918512422715, + "loss": 2.5339, + "step": 2646 + }, + { + "epoch": 0.3035202385047586, + "grad_norm": 0.2671803016762137, + "learning_rate": 0.000816348070614195, + "loss": 2.6087, + "step": 2647 + }, + { + "epoch": 0.3036349042540993, + "grad_norm": 0.2622006373092917, + "learning_rate": 0.0008162042463519326, + "loss": 2.6436, + "step": 2648 + }, + { + "epoch": 0.30374957000344, + "grad_norm": 0.2620055852396241, + "learning_rate": 0.0008160603784753217, + "loss": 2.6013, + "step": 2649 + }, + { + "epoch": 0.30386423575278065, + "grad_norm": 0.2546561832347683, + "learning_rate": 0.0008159164670042062, + "loss": 2.4997, + "step": 2650 + }, + { + "epoch": 0.3039789015021213, + "grad_norm": 0.24637347391066008, + "learning_rate": 0.000815772511958436, + "loss": 2.4108, + "step": 2651 + }, + { + "epoch": 0.30409356725146197, + "grad_norm": 0.25959646568885353, + "learning_rate": 0.000815628513357867, + "loss": 2.5015, + "step": 2652 + }, + { + "epoch": 0.3042082330008027, + "grad_norm": 0.24838959079694223, + "learning_rate": 0.000815484471222361, + "loss": 2.5001, + "step": 2653 + }, + { + "epoch": 0.30432289875014334, + "grad_norm": 0.2705195954772799, + "learning_rate": 0.0008153403855717858, + "loss": 2.4347, + "step": 2654 + }, + { + "epoch": 0.304437564499484, + "grad_norm": 0.2528392991176851, + "learning_rate": 0.0008151962564260153, + "loss": 2.5035, + "step": 2655 + }, + { + "epoch": 0.30455223024882466, + "grad_norm": 0.25160209300453745, + "learning_rate": 0.0008150520838049297, + "loss": 2.5245, + "step": 2656 + }, + { + "epoch": 0.3046668959981653, + "grad_norm": 0.2600276091220383, + "learning_rate": 0.0008149078677284143, + "loss": 2.5614, + "step": 2657 + }, + { + "epoch": 0.30478156174750604, + "grad_norm": 0.260708029582181, + "learning_rate": 0.0008147636082163614, + "loss": 2.5877, + "step": 2658 + }, + { + "epoch": 0.3048962274968467, + "grad_norm": 0.2729979278760044, + "learning_rate": 0.0008146193052886685, + "loss": 2.5036, + "step": 2659 + }, + { + "epoch": 0.30501089324618735, + "grad_norm": 0.2629217107333538, + "learning_rate": 0.0008144749589652398, + "loss": 2.3489, + "step": 2660 + }, + { + "epoch": 0.305125558995528, + "grad_norm": 0.24482058659224318, + "learning_rate": 0.0008143305692659849, + "loss": 2.5435, + "step": 2661 + }, + { + "epoch": 0.3052402247448687, + "grad_norm": 0.2875927971134496, + "learning_rate": 0.0008141861362108196, + "loss": 2.5491, + "step": 2662 + }, + { + "epoch": 0.3053548904942094, + "grad_norm": 0.28872340256237156, + "learning_rate": 0.0008140416598196659, + "loss": 2.5559, + "step": 2663 + }, + { + "epoch": 0.30546955624355004, + "grad_norm": 0.26513107891437365, + "learning_rate": 0.0008138971401124513, + "loss": 2.5457, + "step": 2664 + }, + { + "epoch": 0.3055842219928907, + "grad_norm": 0.25649235122659414, + "learning_rate": 0.0008137525771091097, + "loss": 2.4689, + "step": 2665 + }, + { + "epoch": 0.3056988877422314, + "grad_norm": 0.23955248174996402, + "learning_rate": 0.0008136079708295807, + "loss": 2.6119, + "step": 2666 + }, + { + "epoch": 0.3058135534915721, + "grad_norm": 0.25677393793355757, + "learning_rate": 0.00081346332129381, + "loss": 2.6494, + "step": 2667 + }, + { + "epoch": 0.30592821924091274, + "grad_norm": 0.25144938577487663, + "learning_rate": 0.0008133186285217493, + "loss": 2.5477, + "step": 2668 + }, + { + "epoch": 0.3060428849902534, + "grad_norm": 0.2417079255669188, + "learning_rate": 0.0008131738925333563, + "loss": 2.528, + "step": 2669 + }, + { + "epoch": 0.3061575507395941, + "grad_norm": 0.2585545639496042, + "learning_rate": 0.0008130291133485943, + "loss": 2.6565, + "step": 2670 + }, + { + "epoch": 0.30627221648893477, + "grad_norm": 0.2567159322259357, + "learning_rate": 0.000812884290987433, + "loss": 2.562, + "step": 2671 + }, + { + "epoch": 0.3063868822382754, + "grad_norm": 0.2270050528058689, + "learning_rate": 0.0008127394254698479, + "loss": 2.5468, + "step": 2672 + }, + { + "epoch": 0.3065015479876161, + "grad_norm": 0.2367038454114927, + "learning_rate": 0.0008125945168158205, + "loss": 2.6484, + "step": 2673 + }, + { + "epoch": 0.30661621373695674, + "grad_norm": 0.26456186919946745, + "learning_rate": 0.000812449565045338, + "loss": 2.4856, + "step": 2674 + }, + { + "epoch": 0.30673087948629746, + "grad_norm": 0.2860237512600303, + "learning_rate": 0.000812304570178394, + "loss": 2.5322, + "step": 2675 + }, + { + "epoch": 0.3068455452356381, + "grad_norm": 0.2560239071956316, + "learning_rate": 0.0008121595322349875, + "loss": 2.5024, + "step": 2676 + }, + { + "epoch": 0.3069602109849788, + "grad_norm": 0.26663122374546655, + "learning_rate": 0.0008120144512351237, + "loss": 2.4211, + "step": 2677 + }, + { + "epoch": 0.30707487673431944, + "grad_norm": 0.24580009466119174, + "learning_rate": 0.0008118693271988142, + "loss": 2.4617, + "step": 2678 + }, + { + "epoch": 0.30718954248366015, + "grad_norm": 0.27027706800165135, + "learning_rate": 0.0008117241601460755, + "loss": 2.5046, + "step": 2679 + }, + { + "epoch": 0.3073042082330008, + "grad_norm": 0.2472294981545326, + "learning_rate": 0.0008115789500969309, + "loss": 2.4227, + "step": 2680 + }, + { + "epoch": 0.30741887398234147, + "grad_norm": 0.25270660724312616, + "learning_rate": 0.0008114336970714096, + "loss": 2.6043, + "step": 2681 + }, + { + "epoch": 0.3075335397316821, + "grad_norm": 0.2767914231370437, + "learning_rate": 0.0008112884010895461, + "loss": 2.4796, + "step": 2682 + }, + { + "epoch": 0.30764820548102284, + "grad_norm": 0.27987696269915713, + "learning_rate": 0.0008111430621713814, + "loss": 2.5603, + "step": 2683 + }, + { + "epoch": 0.3077628712303635, + "grad_norm": 0.2784712651880593, + "learning_rate": 0.0008109976803369623, + "loss": 2.5321, + "step": 2684 + }, + { + "epoch": 0.30787753697970416, + "grad_norm": 0.249073994299987, + "learning_rate": 0.0008108522556063411, + "loss": 2.7551, + "step": 2685 + }, + { + "epoch": 0.3079922027290448, + "grad_norm": 0.24350790496860594, + "learning_rate": 0.0008107067879995768, + "loss": 2.5378, + "step": 2686 + }, + { + "epoch": 0.30810686847838553, + "grad_norm": 0.2510122305622537, + "learning_rate": 0.0008105612775367337, + "loss": 2.6519, + "step": 2687 + }, + { + "epoch": 0.3082215342277262, + "grad_norm": 0.23305079242844132, + "learning_rate": 0.0008104157242378821, + "loss": 2.519, + "step": 2688 + }, + { + "epoch": 0.30833619997706685, + "grad_norm": 0.2591300012711176, + "learning_rate": 0.0008102701281230985, + "loss": 2.6377, + "step": 2689 + }, + { + "epoch": 0.3084508657264075, + "grad_norm": 0.25422273581982097, + "learning_rate": 0.0008101244892124651, + "loss": 2.5365, + "step": 2690 + }, + { + "epoch": 0.30856553147574817, + "grad_norm": 0.2532654516377513, + "learning_rate": 0.0008099788075260698, + "loss": 2.5476, + "step": 2691 + }, + { + "epoch": 0.3086801972250889, + "grad_norm": 0.26042341344518805, + "learning_rate": 0.0008098330830840066, + "loss": 2.4158, + "step": 2692 + }, + { + "epoch": 0.30879486297442954, + "grad_norm": 0.26647153041776145, + "learning_rate": 0.0008096873159063758, + "loss": 2.506, + "step": 2693 + }, + { + "epoch": 0.3089095287237702, + "grad_norm": 0.2678328220401086, + "learning_rate": 0.0008095415060132829, + "loss": 2.5318, + "step": 2694 + }, + { + "epoch": 0.30902419447311086, + "grad_norm": 0.26848607307906336, + "learning_rate": 0.0008093956534248395, + "loss": 2.5806, + "step": 2695 + }, + { + "epoch": 0.3091388602224516, + "grad_norm": 0.2981021101253301, + "learning_rate": 0.0008092497581611636, + "loss": 2.5648, + "step": 2696 + }, + { + "epoch": 0.30925352597179223, + "grad_norm": 0.2994617926537688, + "learning_rate": 0.0008091038202423781, + "loss": 2.6669, + "step": 2697 + }, + { + "epoch": 0.3093681917211329, + "grad_norm": 0.2746069578867281, + "learning_rate": 0.0008089578396886128, + "loss": 2.5688, + "step": 2698 + }, + { + "epoch": 0.30948285747047355, + "grad_norm": 0.29261380374451496, + "learning_rate": 0.0008088118165200026, + "loss": 2.6666, + "step": 2699 + }, + { + "epoch": 0.30959752321981426, + "grad_norm": 0.2539384341265657, + "learning_rate": 0.000808665750756689, + "loss": 2.5422, + "step": 2700 + }, + { + "epoch": 0.3097121889691549, + "grad_norm": 0.25576990037691166, + "learning_rate": 0.0008085196424188188, + "loss": 2.5166, + "step": 2701 + }, + { + "epoch": 0.3098268547184956, + "grad_norm": 0.22904459733372143, + "learning_rate": 0.0008083734915265448, + "loss": 2.5619, + "step": 2702 + }, + { + "epoch": 0.30994152046783624, + "grad_norm": 0.24781701338787185, + "learning_rate": 0.0008082272981000258, + "loss": 2.6609, + "step": 2703 + }, + { + "epoch": 0.31005618621717695, + "grad_norm": 0.24955570645334638, + "learning_rate": 0.0008080810621594264, + "loss": 2.5858, + "step": 2704 + }, + { + "epoch": 0.3101708519665176, + "grad_norm": 0.2578692584024744, + "learning_rate": 0.0008079347837249168, + "loss": 2.5622, + "step": 2705 + }, + { + "epoch": 0.31028551771585827, + "grad_norm": 0.23184892457891496, + "learning_rate": 0.0008077884628166738, + "loss": 2.617, + "step": 2706 + }, + { + "epoch": 0.31040018346519893, + "grad_norm": 0.2684669265473634, + "learning_rate": 0.0008076420994548792, + "loss": 2.5964, + "step": 2707 + }, + { + "epoch": 0.3105148492145396, + "grad_norm": 0.2624183159586972, + "learning_rate": 0.0008074956936597213, + "loss": 2.6638, + "step": 2708 + }, + { + "epoch": 0.3106295149638803, + "grad_norm": 0.25487151867290747, + "learning_rate": 0.0008073492454513938, + "loss": 2.5498, + "step": 2709 + }, + { + "epoch": 0.31074418071322096, + "grad_norm": 0.24230756186544447, + "learning_rate": 0.0008072027548500964, + "loss": 2.4159, + "step": 2710 + }, + { + "epoch": 0.3108588464625616, + "grad_norm": 0.2742094328852907, + "learning_rate": 0.0008070562218760349, + "loss": 2.5136, + "step": 2711 + }, + { + "epoch": 0.3109735122119023, + "grad_norm": 0.2326028214877704, + "learning_rate": 0.0008069096465494205, + "loss": 2.4249, + "step": 2712 + }, + { + "epoch": 0.311088177961243, + "grad_norm": 0.2522300416371095, + "learning_rate": 0.0008067630288904708, + "loss": 2.6436, + "step": 2713 + }, + { + "epoch": 0.31120284371058365, + "grad_norm": 0.2518281576806681, + "learning_rate": 0.0008066163689194086, + "loss": 2.5271, + "step": 2714 + }, + { + "epoch": 0.3113175094599243, + "grad_norm": 0.2502315045755176, + "learning_rate": 0.0008064696666564631, + "loss": 2.5947, + "step": 2715 + }, + { + "epoch": 0.31143217520926497, + "grad_norm": 0.2694345853205722, + "learning_rate": 0.0008063229221218686, + "loss": 2.6579, + "step": 2716 + }, + { + "epoch": 0.3115468409586057, + "grad_norm": 0.2521574112192198, + "learning_rate": 0.0008061761353358663, + "loss": 2.4974, + "step": 2717 + }, + { + "epoch": 0.31166150670794635, + "grad_norm": 0.26179401038550065, + "learning_rate": 0.0008060293063187023, + "loss": 2.5257, + "step": 2718 + }, + { + "epoch": 0.311776172457287, + "grad_norm": 0.26546986535215866, + "learning_rate": 0.0008058824350906288, + "loss": 2.5105, + "step": 2719 + }, + { + "epoch": 0.31189083820662766, + "grad_norm": 0.2668851846980171, + "learning_rate": 0.0008057355216719043, + "loss": 2.5784, + "step": 2720 + }, + { + "epoch": 0.3120055039559684, + "grad_norm": 0.251776404799129, + "learning_rate": 0.0008055885660827922, + "loss": 2.5008, + "step": 2721 + }, + { + "epoch": 0.31212016970530904, + "grad_norm": 0.2842423454495024, + "learning_rate": 0.0008054415683435625, + "loss": 2.7241, + "step": 2722 + }, + { + "epoch": 0.3122348354546497, + "grad_norm": 0.2701201811750471, + "learning_rate": 0.0008052945284744909, + "loss": 2.5242, + "step": 2723 + }, + { + "epoch": 0.31234950120399035, + "grad_norm": 0.25862254861137246, + "learning_rate": 0.0008051474464958584, + "loss": 2.5886, + "step": 2724 + }, + { + "epoch": 0.312464166953331, + "grad_norm": 0.27070114497362996, + "learning_rate": 0.0008050003224279521, + "loss": 2.5218, + "step": 2725 + }, + { + "epoch": 0.3125788327026717, + "grad_norm": 0.25952894474072247, + "learning_rate": 0.0008048531562910655, + "loss": 2.5008, + "step": 2726 + }, + { + "epoch": 0.3126934984520124, + "grad_norm": 0.24486997316840947, + "learning_rate": 0.0008047059481054967, + "loss": 2.5288, + "step": 2727 + }, + { + "epoch": 0.31280816420135305, + "grad_norm": 0.293489170532409, + "learning_rate": 0.0008045586978915508, + "loss": 2.6463, + "step": 2728 + }, + { + "epoch": 0.3129228299506937, + "grad_norm": 0.26298020601946676, + "learning_rate": 0.0008044114056695379, + "loss": 2.5373, + "step": 2729 + }, + { + "epoch": 0.3130374957000344, + "grad_norm": 0.281979728365036, + "learning_rate": 0.0008042640714597741, + "loss": 2.5573, + "step": 2730 + }, + { + "epoch": 0.3131521614493751, + "grad_norm": 0.29281781934445444, + "learning_rate": 0.0008041166952825816, + "loss": 2.5292, + "step": 2731 + }, + { + "epoch": 0.31326682719871574, + "grad_norm": 0.2658127679136106, + "learning_rate": 0.0008039692771582878, + "loss": 2.4615, + "step": 2732 + }, + { + "epoch": 0.3133814929480564, + "grad_norm": 0.28768141583692936, + "learning_rate": 0.0008038218171072264, + "loss": 2.6701, + "step": 2733 + }, + { + "epoch": 0.3134961586973971, + "grad_norm": 0.25830144294692137, + "learning_rate": 0.000803674315149737, + "loss": 2.5464, + "step": 2734 + }, + { + "epoch": 0.31361082444673777, + "grad_norm": 0.2852846254635785, + "learning_rate": 0.0008035267713061641, + "loss": 2.5928, + "step": 2735 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.23472164721643343, + "learning_rate": 0.0008033791855968589, + "loss": 2.4265, + "step": 2736 + }, + { + "epoch": 0.3138401559454191, + "grad_norm": 0.25493818323981204, + "learning_rate": 0.000803231558042178, + "loss": 2.6247, + "step": 2737 + }, + { + "epoch": 0.3139548216947598, + "grad_norm": 0.28023293351777306, + "learning_rate": 0.0008030838886624838, + "loss": 2.5063, + "step": 2738 + }, + { + "epoch": 0.31406948744410046, + "grad_norm": 0.2630246056169887, + "learning_rate": 0.0008029361774781446, + "loss": 2.394, + "step": 2739 + }, + { + "epoch": 0.3141841531934411, + "grad_norm": 0.2560965038303245, + "learning_rate": 0.0008027884245095338, + "loss": 2.4202, + "step": 2740 + }, + { + "epoch": 0.3142988189427818, + "grad_norm": 0.24807465840054496, + "learning_rate": 0.0008026406297770319, + "loss": 2.4311, + "step": 2741 + }, + { + "epoch": 0.31441348469212244, + "grad_norm": 0.2887535322603709, + "learning_rate": 0.0008024927933010237, + "loss": 2.3472, + "step": 2742 + }, + { + "epoch": 0.31452815044146315, + "grad_norm": 0.26549045528082954, + "learning_rate": 0.0008023449151019009, + "loss": 2.5869, + "step": 2743 + }, + { + "epoch": 0.3146428161908038, + "grad_norm": 0.2604158522416407, + "learning_rate": 0.0008021969952000603, + "loss": 2.4567, + "step": 2744 + }, + { + "epoch": 0.31475748194014447, + "grad_norm": 0.2904746956641578, + "learning_rate": 0.0008020490336159045, + "loss": 2.4965, + "step": 2745 + }, + { + "epoch": 0.3148721476894851, + "grad_norm": 0.27773170736459274, + "learning_rate": 0.0008019010303698422, + "loss": 2.6165, + "step": 2746 + }, + { + "epoch": 0.31498681343882584, + "grad_norm": 0.27519260312158306, + "learning_rate": 0.0008017529854822873, + "loss": 2.6651, + "step": 2747 + }, + { + "epoch": 0.3151014791881665, + "grad_norm": 0.3125738281926727, + "learning_rate": 0.0008016048989736602, + "loss": 2.6902, + "step": 2748 + }, + { + "epoch": 0.31521614493750716, + "grad_norm": 0.25423221436240917, + "learning_rate": 0.0008014567708643864, + "loss": 2.5691, + "step": 2749 + }, + { + "epoch": 0.3153308106868478, + "grad_norm": 0.28187838067164245, + "learning_rate": 0.0008013086011748972, + "loss": 2.5893, + "step": 2750 + }, + { + "epoch": 0.31544547643618853, + "grad_norm": 0.25368645867522216, + "learning_rate": 0.0008011603899256301, + "loss": 2.7142, + "step": 2751 + }, + { + "epoch": 0.3155601421855292, + "grad_norm": 0.27491745562039316, + "learning_rate": 0.0008010121371370277, + "loss": 2.5609, + "step": 2752 + }, + { + "epoch": 0.31567480793486985, + "grad_norm": 0.2781293228817274, + "learning_rate": 0.000800863842829539, + "loss": 2.3923, + "step": 2753 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.27348538943752926, + "learning_rate": 0.000800715507023618, + "loss": 2.4618, + "step": 2754 + }, + { + "epoch": 0.3159041394335512, + "grad_norm": 0.2667352028282389, + "learning_rate": 0.0008005671297397248, + "loss": 2.6094, + "step": 2755 + }, + { + "epoch": 0.3160188051828919, + "grad_norm": 0.26176336754292906, + "learning_rate": 0.0008004187109983257, + "loss": 2.495, + "step": 2756 + }, + { + "epoch": 0.31613347093223254, + "grad_norm": 0.25856853438779276, + "learning_rate": 0.0008002702508198918, + "loss": 2.5271, + "step": 2757 + }, + { + "epoch": 0.3162481366815732, + "grad_norm": 0.2697515561619566, + "learning_rate": 0.0008001217492249004, + "loss": 2.7077, + "step": 2758 + }, + { + "epoch": 0.31636280243091386, + "grad_norm": 0.2727689479532433, + "learning_rate": 0.0007999732062338347, + "loss": 2.6964, + "step": 2759 + }, + { + "epoch": 0.3164774681802546, + "grad_norm": 0.26509726297860264, + "learning_rate": 0.0007998246218671829, + "loss": 2.6899, + "step": 2760 + }, + { + "epoch": 0.31659213392959523, + "grad_norm": 0.2688068616215317, + "learning_rate": 0.00079967599614544, + "loss": 2.5834, + "step": 2761 + }, + { + "epoch": 0.3167067996789359, + "grad_norm": 0.25015862351846097, + "learning_rate": 0.0007995273290891056, + "loss": 2.4707, + "step": 2762 + }, + { + "epoch": 0.31682146542827655, + "grad_norm": 0.29203197185339563, + "learning_rate": 0.0007993786207186859, + "loss": 2.6624, + "step": 2763 + }, + { + "epoch": 0.31693613117761726, + "grad_norm": 0.21693830768901595, + "learning_rate": 0.000799229871054692, + "loss": 2.6257, + "step": 2764 + }, + { + "epoch": 0.3170507969269579, + "grad_norm": 0.24463681816752275, + "learning_rate": 0.0007990810801176411, + "loss": 2.6584, + "step": 2765 + }, + { + "epoch": 0.3171654626762986, + "grad_norm": 0.23992130071722484, + "learning_rate": 0.0007989322479280564, + "loss": 2.5523, + "step": 2766 + }, + { + "epoch": 0.31728012842563924, + "grad_norm": 0.2523002662428207, + "learning_rate": 0.000798783374506466, + "loss": 2.4852, + "step": 2767 + }, + { + "epoch": 0.31739479417497996, + "grad_norm": 0.24347098972075343, + "learning_rate": 0.0007986344598734048, + "loss": 2.454, + "step": 2768 + }, + { + "epoch": 0.3175094599243206, + "grad_norm": 0.2262762888043256, + "learning_rate": 0.0007984855040494122, + "loss": 2.473, + "step": 2769 + }, + { + "epoch": 0.3176241256736613, + "grad_norm": 0.2417610616217539, + "learning_rate": 0.0007983365070550339, + "loss": 2.5634, + "step": 2770 + }, + { + "epoch": 0.31773879142300193, + "grad_norm": 0.2632834339460108, + "learning_rate": 0.0007981874689108213, + "loss": 2.655, + "step": 2771 + }, + { + "epoch": 0.31785345717234265, + "grad_norm": 0.26639872155513394, + "learning_rate": 0.0007980383896373312, + "loss": 2.4897, + "step": 2772 + }, + { + "epoch": 0.3179681229216833, + "grad_norm": 0.2765302746988034, + "learning_rate": 0.0007978892692551265, + "loss": 2.4632, + "step": 2773 + }, + { + "epoch": 0.31808278867102396, + "grad_norm": 0.2834677864598343, + "learning_rate": 0.0007977401077847755, + "loss": 2.8824, + "step": 2774 + }, + { + "epoch": 0.3181974544203646, + "grad_norm": 0.2663689655277662, + "learning_rate": 0.0007975909052468518, + "loss": 2.5171, + "step": 2775 + }, + { + "epoch": 0.3183121201697053, + "grad_norm": 0.2449206846226601, + "learning_rate": 0.0007974416616619355, + "loss": 2.5471, + "step": 2776 + }, + { + "epoch": 0.318426785919046, + "grad_norm": 0.25571279625488785, + "learning_rate": 0.0007972923770506118, + "loss": 2.6343, + "step": 2777 + }, + { + "epoch": 0.31854145166838665, + "grad_norm": 0.24758240123761288, + "learning_rate": 0.0007971430514334715, + "loss": 2.4911, + "step": 2778 + }, + { + "epoch": 0.3186561174177273, + "grad_norm": 0.2822630117194901, + "learning_rate": 0.0007969936848311113, + "loss": 2.5072, + "step": 2779 + }, + { + "epoch": 0.318770783167068, + "grad_norm": 0.2793599018988377, + "learning_rate": 0.0007968442772641334, + "loss": 2.5835, + "step": 2780 + }, + { + "epoch": 0.3188854489164087, + "grad_norm": 0.27403182746574295, + "learning_rate": 0.000796694828753146, + "loss": 2.5953, + "step": 2781 + }, + { + "epoch": 0.31900011466574935, + "grad_norm": 0.24014038943632512, + "learning_rate": 0.0007965453393187624, + "loss": 2.6227, + "step": 2782 + }, + { + "epoch": 0.31911478041509, + "grad_norm": 0.2763432628460241, + "learning_rate": 0.000796395808981602, + "loss": 2.559, + "step": 2783 + }, + { + "epoch": 0.31922944616443066, + "grad_norm": 0.23799734984909474, + "learning_rate": 0.0007962462377622895, + "loss": 2.3974, + "step": 2784 + }, + { + "epoch": 0.3193441119137714, + "grad_norm": 0.283903704983406, + "learning_rate": 0.0007960966256814555, + "loss": 2.6166, + "step": 2785 + }, + { + "epoch": 0.31945877766311204, + "grad_norm": 0.2437832505668744, + "learning_rate": 0.0007959469727597359, + "loss": 2.5275, + "step": 2786 + }, + { + "epoch": 0.3195734434124527, + "grad_norm": 0.273707121802684, + "learning_rate": 0.0007957972790177729, + "loss": 2.5866, + "step": 2787 + }, + { + "epoch": 0.31968810916179335, + "grad_norm": 0.28004690217837547, + "learning_rate": 0.0007956475444762137, + "loss": 2.3797, + "step": 2788 + }, + { + "epoch": 0.31980277491113407, + "grad_norm": 0.26827576369003925, + "learning_rate": 0.0007954977691557112, + "loss": 2.6477, + "step": 2789 + }, + { + "epoch": 0.31991744066047473, + "grad_norm": 0.26214519874774866, + "learning_rate": 0.0007953479530769241, + "loss": 2.4592, + "step": 2790 + }, + { + "epoch": 0.3200321064098154, + "grad_norm": 0.23303009260242052, + "learning_rate": 0.0007951980962605168, + "loss": 2.4281, + "step": 2791 + }, + { + "epoch": 0.32014677215915605, + "grad_norm": 0.25066238317430645, + "learning_rate": 0.000795048198727159, + "loss": 2.4801, + "step": 2792 + }, + { + "epoch": 0.3202614379084967, + "grad_norm": 0.2836330464348542, + "learning_rate": 0.0007948982604975264, + "loss": 2.4841, + "step": 2793 + }, + { + "epoch": 0.3203761036578374, + "grad_norm": 0.29358355746930076, + "learning_rate": 0.0007947482815923001, + "loss": 2.6007, + "step": 2794 + }, + { + "epoch": 0.3204907694071781, + "grad_norm": 0.28705019498155937, + "learning_rate": 0.0007945982620321666, + "loss": 2.6222, + "step": 2795 + }, + { + "epoch": 0.32060543515651874, + "grad_norm": 0.2483678228089555, + "learning_rate": 0.0007944482018378185, + "loss": 2.6724, + "step": 2796 + }, + { + "epoch": 0.3207201009058594, + "grad_norm": 0.26782423867168276, + "learning_rate": 0.0007942981010299537, + "loss": 2.6557, + "step": 2797 + }, + { + "epoch": 0.3208347666552001, + "grad_norm": 0.2407086954012889, + "learning_rate": 0.0007941479596292756, + "loss": 2.5861, + "step": 2798 + }, + { + "epoch": 0.32094943240454077, + "grad_norm": 0.24774972243949178, + "learning_rate": 0.0007939977776564935, + "loss": 2.5715, + "step": 2799 + }, + { + "epoch": 0.3210640981538814, + "grad_norm": 0.26441328332595304, + "learning_rate": 0.0007938475551323221, + "loss": 2.5076, + "step": 2800 + }, + { + "epoch": 0.3211787639032221, + "grad_norm": 0.2394643466809427, + "learning_rate": 0.0007936972920774817, + "loss": 2.4372, + "step": 2801 + }, + { + "epoch": 0.3212934296525628, + "grad_norm": 0.26451877179185185, + "learning_rate": 0.000793546988512698, + "loss": 2.6105, + "step": 2802 + }, + { + "epoch": 0.32140809540190346, + "grad_norm": 0.25785014674822554, + "learning_rate": 0.0007933966444587031, + "loss": 2.5864, + "step": 2803 + }, + { + "epoch": 0.3215227611512441, + "grad_norm": 0.2693742148968166, + "learning_rate": 0.0007932462599362335, + "loss": 2.6268, + "step": 2804 + }, + { + "epoch": 0.3216374269005848, + "grad_norm": 0.2446786164322382, + "learning_rate": 0.0007930958349660323, + "loss": 2.5137, + "step": 2805 + }, + { + "epoch": 0.3217520926499255, + "grad_norm": 0.25502467829752334, + "learning_rate": 0.0007929453695688475, + "loss": 2.4641, + "step": 2806 + }, + { + "epoch": 0.32186675839926615, + "grad_norm": 0.2720720902427651, + "learning_rate": 0.000792794863765433, + "loss": 2.7138, + "step": 2807 + }, + { + "epoch": 0.3219814241486068, + "grad_norm": 0.3041731071757791, + "learning_rate": 0.0007926443175765483, + "loss": 2.5614, + "step": 2808 + }, + { + "epoch": 0.32209608989794747, + "grad_norm": 0.24676312579365384, + "learning_rate": 0.0007924937310229583, + "loss": 2.3599, + "step": 2809 + }, + { + "epoch": 0.3222107556472881, + "grad_norm": 0.2785274434895631, + "learning_rate": 0.0007923431041254335, + "loss": 2.4711, + "step": 2810 + }, + { + "epoch": 0.32232542139662884, + "grad_norm": 0.27688080833143375, + "learning_rate": 0.00079219243690475, + "loss": 2.5351, + "step": 2811 + }, + { + "epoch": 0.3224400871459695, + "grad_norm": 0.2883791278677757, + "learning_rate": 0.0007920417293816895, + "loss": 2.5583, + "step": 2812 + }, + { + "epoch": 0.32255475289531016, + "grad_norm": 0.2864312738508502, + "learning_rate": 0.0007918909815770394, + "loss": 2.6237, + "step": 2813 + }, + { + "epoch": 0.3226694186446508, + "grad_norm": 0.2650966063170184, + "learning_rate": 0.0007917401935115923, + "loss": 2.6403, + "step": 2814 + }, + { + "epoch": 0.32278408439399153, + "grad_norm": 0.24246663942191735, + "learning_rate": 0.0007915893652061466, + "loss": 2.584, + "step": 2815 + }, + { + "epoch": 0.3228987501433322, + "grad_norm": 0.24740388510383968, + "learning_rate": 0.000791438496681506, + "loss": 2.4981, + "step": 2816 + }, + { + "epoch": 0.32301341589267285, + "grad_norm": 0.275669739600022, + "learning_rate": 0.0007912875879584802, + "loss": 2.4866, + "step": 2817 + }, + { + "epoch": 0.3231280816420135, + "grad_norm": 0.26084347006336706, + "learning_rate": 0.0007911366390578841, + "loss": 2.3569, + "step": 2818 + }, + { + "epoch": 0.3232427473913542, + "grad_norm": 0.2344061901438677, + "learning_rate": 0.0007909856500005382, + "loss": 2.4777, + "step": 2819 + }, + { + "epoch": 0.3233574131406949, + "grad_norm": 0.2439786015431069, + "learning_rate": 0.0007908346208072686, + "loss": 2.4736, + "step": 2820 + }, + { + "epoch": 0.32347207889003554, + "grad_norm": 0.24218638585601474, + "learning_rate": 0.0007906835514989068, + "loss": 2.3964, + "step": 2821 + }, + { + "epoch": 0.3235867446393762, + "grad_norm": 0.23865145209032212, + "learning_rate": 0.0007905324420962901, + "loss": 2.4186, + "step": 2822 + }, + { + "epoch": 0.3237014103887169, + "grad_norm": 0.28132928295466353, + "learning_rate": 0.0007903812926202611, + "loss": 2.5713, + "step": 2823 + }, + { + "epoch": 0.3238160761380576, + "grad_norm": 0.31046520165071356, + "learning_rate": 0.0007902301030916679, + "loss": 2.6242, + "step": 2824 + }, + { + "epoch": 0.32393074188739823, + "grad_norm": 0.27325255005130056, + "learning_rate": 0.0007900788735313642, + "loss": 2.5407, + "step": 2825 + }, + { + "epoch": 0.3240454076367389, + "grad_norm": 0.25140306649157157, + "learning_rate": 0.0007899276039602094, + "loss": 2.5342, + "step": 2826 + }, + { + "epoch": 0.3241600733860796, + "grad_norm": 0.28537929220734265, + "learning_rate": 0.000789776294399068, + "loss": 2.6859, + "step": 2827 + }, + { + "epoch": 0.32427473913542026, + "grad_norm": 0.2671271911444393, + "learning_rate": 0.0007896249448688106, + "loss": 2.5106, + "step": 2828 + }, + { + "epoch": 0.3243894048847609, + "grad_norm": 0.25409723441991316, + "learning_rate": 0.0007894735553903127, + "loss": 2.5361, + "step": 2829 + }, + { + "epoch": 0.3245040706341016, + "grad_norm": 0.26652403393815277, + "learning_rate": 0.0007893221259844558, + "loss": 2.5533, + "step": 2830 + }, + { + "epoch": 0.32461873638344224, + "grad_norm": 0.27379868085659836, + "learning_rate": 0.0007891706566721266, + "loss": 2.6198, + "step": 2831 + }, + { + "epoch": 0.32473340213278296, + "grad_norm": 0.22681342881836883, + "learning_rate": 0.0007890191474742173, + "loss": 2.3829, + "step": 2832 + }, + { + "epoch": 0.3248480678821236, + "grad_norm": 0.23892736322183392, + "learning_rate": 0.0007888675984116258, + "loss": 2.5828, + "step": 2833 + }, + { + "epoch": 0.3249627336314643, + "grad_norm": 0.24048152724183752, + "learning_rate": 0.0007887160095052555, + "loss": 2.4545, + "step": 2834 + }, + { + "epoch": 0.32507739938080493, + "grad_norm": 0.25472926845466093, + "learning_rate": 0.000788564380776015, + "loss": 2.5582, + "step": 2835 + }, + { + "epoch": 0.32519206513014565, + "grad_norm": 0.2260893058046602, + "learning_rate": 0.000788412712244819, + "loss": 2.5609, + "step": 2836 + }, + { + "epoch": 0.3253067308794863, + "grad_norm": 0.28064529785732356, + "learning_rate": 0.0007882610039325867, + "loss": 2.5482, + "step": 2837 + }, + { + "epoch": 0.32542139662882696, + "grad_norm": 0.22439188050475212, + "learning_rate": 0.0007881092558602437, + "loss": 2.5169, + "step": 2838 + }, + { + "epoch": 0.3255360623781676, + "grad_norm": 0.24008033615671384, + "learning_rate": 0.0007879574680487209, + "loss": 2.6281, + "step": 2839 + }, + { + "epoch": 0.32565072812750834, + "grad_norm": 0.27526431123506817, + "learning_rate": 0.0007878056405189542, + "loss": 2.6883, + "step": 2840 + }, + { + "epoch": 0.325765393876849, + "grad_norm": 0.2744979686760225, + "learning_rate": 0.0007876537732918855, + "loss": 2.607, + "step": 2841 + }, + { + "epoch": 0.32588005962618966, + "grad_norm": 0.22289891469328385, + "learning_rate": 0.0007875018663884619, + "loss": 2.3838, + "step": 2842 + }, + { + "epoch": 0.3259947253755303, + "grad_norm": 0.2798770864686912, + "learning_rate": 0.0007873499198296361, + "loss": 2.5933, + "step": 2843 + }, + { + "epoch": 0.32610939112487103, + "grad_norm": 0.2575396719658231, + "learning_rate": 0.0007871979336363664, + "loss": 2.5121, + "step": 2844 + }, + { + "epoch": 0.3262240568742117, + "grad_norm": 0.26922923653053504, + "learning_rate": 0.0007870459078296162, + "loss": 2.528, + "step": 2845 + }, + { + "epoch": 0.32633872262355235, + "grad_norm": 0.25322752844208946, + "learning_rate": 0.0007868938424303545, + "loss": 2.4416, + "step": 2846 + }, + { + "epoch": 0.326453388372893, + "grad_norm": 0.2538662258094524, + "learning_rate": 0.000786741737459556, + "loss": 2.4377, + "step": 2847 + }, + { + "epoch": 0.32656805412223366, + "grad_norm": 0.24189717973684755, + "learning_rate": 0.0007865895929382007, + "loss": 2.5253, + "step": 2848 + }, + { + "epoch": 0.3266827198715744, + "grad_norm": 0.2688373634533083, + "learning_rate": 0.0007864374088872739, + "loss": 2.5152, + "step": 2849 + }, + { + "epoch": 0.32679738562091504, + "grad_norm": 0.28012949856809244, + "learning_rate": 0.0007862851853277664, + "loss": 2.5565, + "step": 2850 + }, + { + "epoch": 0.3269120513702557, + "grad_norm": 0.25456421485604575, + "learning_rate": 0.0007861329222806748, + "loss": 2.4943, + "step": 2851 + }, + { + "epoch": 0.32702671711959636, + "grad_norm": 0.22615743663347057, + "learning_rate": 0.0007859806197670007, + "loss": 2.5001, + "step": 2852 + }, + { + "epoch": 0.32714138286893707, + "grad_norm": 0.2589506373369928, + "learning_rate": 0.0007858282778077513, + "loss": 2.5648, + "step": 2853 + }, + { + "epoch": 0.32725604861827773, + "grad_norm": 0.26710779443082944, + "learning_rate": 0.0007856758964239396, + "loss": 2.4572, + "step": 2854 + }, + { + "epoch": 0.3273707143676184, + "grad_norm": 0.26219531440983723, + "learning_rate": 0.0007855234756365832, + "loss": 2.5781, + "step": 2855 + }, + { + "epoch": 0.32748538011695905, + "grad_norm": 0.2475063160137756, + "learning_rate": 0.0007853710154667062, + "loss": 2.3766, + "step": 2856 + }, + { + "epoch": 0.32760004586629976, + "grad_norm": 0.26762627190769117, + "learning_rate": 0.0007852185159353371, + "loss": 2.6682, + "step": 2857 + }, + { + "epoch": 0.3277147116156404, + "grad_norm": 0.30475359791353324, + "learning_rate": 0.0007850659770635104, + "loss": 2.5329, + "step": 2858 + }, + { + "epoch": 0.3278293773649811, + "grad_norm": 0.26807150913198946, + "learning_rate": 0.0007849133988722663, + "loss": 2.6813, + "step": 2859 + }, + { + "epoch": 0.32794404311432174, + "grad_norm": 0.2790196442806882, + "learning_rate": 0.0007847607813826496, + "loss": 2.6527, + "step": 2860 + }, + { + "epoch": 0.32805870886366245, + "grad_norm": 0.2764591667273137, + "learning_rate": 0.0007846081246157111, + "loss": 2.6602, + "step": 2861 + }, + { + "epoch": 0.3281733746130031, + "grad_norm": 0.2531657367419072, + "learning_rate": 0.000784455428592507, + "loss": 2.5534, + "step": 2862 + }, + { + "epoch": 0.32828804036234377, + "grad_norm": 0.24112452993303823, + "learning_rate": 0.0007843026933340988, + "loss": 2.487, + "step": 2863 + }, + { + "epoch": 0.32840270611168443, + "grad_norm": 0.27201995237372106, + "learning_rate": 0.0007841499188615533, + "loss": 2.5236, + "step": 2864 + }, + { + "epoch": 0.3285173718610251, + "grad_norm": 0.25679596670328025, + "learning_rate": 0.0007839971051959427, + "loss": 2.373, + "step": 2865 + }, + { + "epoch": 0.3286320376103658, + "grad_norm": 0.25598158059681514, + "learning_rate": 0.0007838442523583451, + "loss": 2.6375, + "step": 2866 + }, + { + "epoch": 0.32874670335970646, + "grad_norm": 0.24157278762459178, + "learning_rate": 0.0007836913603698434, + "loss": 2.4772, + "step": 2867 + }, + { + "epoch": 0.3288613691090471, + "grad_norm": 0.24300549690134113, + "learning_rate": 0.0007835384292515263, + "loss": 2.5608, + "step": 2868 + }, + { + "epoch": 0.3289760348583878, + "grad_norm": 0.23423658065328476, + "learning_rate": 0.0007833854590244875, + "loss": 2.5183, + "step": 2869 + }, + { + "epoch": 0.3290907006077285, + "grad_norm": 0.2633010515284913, + "learning_rate": 0.0007832324497098266, + "loss": 2.5514, + "step": 2870 + }, + { + "epoch": 0.32920536635706915, + "grad_norm": 0.25078891805469994, + "learning_rate": 0.0007830794013286479, + "loss": 2.4767, + "step": 2871 + }, + { + "epoch": 0.3293200321064098, + "grad_norm": 0.25320941166244093, + "learning_rate": 0.0007829263139020619, + "loss": 2.6326, + "step": 2872 + }, + { + "epoch": 0.32943469785575047, + "grad_norm": 0.2622802972303187, + "learning_rate": 0.0007827731874511841, + "loss": 2.5479, + "step": 2873 + }, + { + "epoch": 0.3295493636050912, + "grad_norm": 0.25869669665015843, + "learning_rate": 0.0007826200219971352, + "loss": 2.5387, + "step": 2874 + }, + { + "epoch": 0.32966402935443184, + "grad_norm": 0.2470348670177471, + "learning_rate": 0.0007824668175610412, + "loss": 2.5624, + "step": 2875 + }, + { + "epoch": 0.3297786951037725, + "grad_norm": 0.24449306216271172, + "learning_rate": 0.0007823135741640343, + "loss": 2.5067, + "step": 2876 + }, + { + "epoch": 0.32989336085311316, + "grad_norm": 0.258631219954575, + "learning_rate": 0.0007821602918272512, + "loss": 2.4725, + "step": 2877 + }, + { + "epoch": 0.3300080266024539, + "grad_norm": 0.2676561394074989, + "learning_rate": 0.0007820069705718342, + "loss": 2.6316, + "step": 2878 + }, + { + "epoch": 0.33012269235179453, + "grad_norm": 0.2565330499417993, + "learning_rate": 0.0007818536104189313, + "loss": 2.5035, + "step": 2879 + }, + { + "epoch": 0.3302373581011352, + "grad_norm": 0.22433423312559314, + "learning_rate": 0.0007817002113896954, + "loss": 2.4204, + "step": 2880 + }, + { + "epoch": 0.33035202385047585, + "grad_norm": 0.25392861009320566, + "learning_rate": 0.000781546773505285, + "loss": 2.4006, + "step": 2881 + }, + { + "epoch": 0.3304666895998165, + "grad_norm": 0.2762052958517115, + "learning_rate": 0.000781393296786864, + "loss": 2.5937, + "step": 2882 + }, + { + "epoch": 0.3305813553491572, + "grad_norm": 0.25738716911641596, + "learning_rate": 0.0007812397812556015, + "loss": 2.4689, + "step": 2883 + }, + { + "epoch": 0.3306960210984979, + "grad_norm": 0.2816323881872482, + "learning_rate": 0.0007810862269326722, + "loss": 2.5029, + "step": 2884 + }, + { + "epoch": 0.33081068684783854, + "grad_norm": 0.2469754387458031, + "learning_rate": 0.0007809326338392557, + "loss": 2.56, + "step": 2885 + }, + { + "epoch": 0.3309253525971792, + "grad_norm": 0.286507441268205, + "learning_rate": 0.0007807790019965376, + "loss": 2.5209, + "step": 2886 + }, + { + "epoch": 0.3310400183465199, + "grad_norm": 0.26586167785692216, + "learning_rate": 0.0007806253314257082, + "loss": 2.5099, + "step": 2887 + }, + { + "epoch": 0.3311546840958606, + "grad_norm": 0.2579354015847745, + "learning_rate": 0.0007804716221479637, + "loss": 2.6918, + "step": 2888 + }, + { + "epoch": 0.33126934984520123, + "grad_norm": 0.2467974256258055, + "learning_rate": 0.000780317874184505, + "loss": 2.5583, + "step": 2889 + }, + { + "epoch": 0.3313840155945419, + "grad_norm": 0.2485631789784952, + "learning_rate": 0.000780164087556539, + "loss": 2.5398, + "step": 2890 + }, + { + "epoch": 0.3314986813438826, + "grad_norm": 0.26586979682912076, + "learning_rate": 0.0007800102622852776, + "loss": 2.5726, + "step": 2891 + }, + { + "epoch": 0.33161334709322327, + "grad_norm": 0.24974367660962007, + "learning_rate": 0.0007798563983919379, + "loss": 2.4688, + "step": 2892 + }, + { + "epoch": 0.3317280128425639, + "grad_norm": 0.23674130435754995, + "learning_rate": 0.0007797024958977425, + "loss": 2.6279, + "step": 2893 + }, + { + "epoch": 0.3318426785919046, + "grad_norm": 0.2600862837713367, + "learning_rate": 0.0007795485548239196, + "loss": 2.3539, + "step": 2894 + }, + { + "epoch": 0.3319573443412453, + "grad_norm": 0.26076885681071876, + "learning_rate": 0.0007793945751917022, + "loss": 2.5097, + "step": 2895 + }, + { + "epoch": 0.33207201009058596, + "grad_norm": 0.25478438547384136, + "learning_rate": 0.0007792405570223289, + "loss": 2.4871, + "step": 2896 + }, + { + "epoch": 0.3321866758399266, + "grad_norm": 0.23152418019005155, + "learning_rate": 0.0007790865003370434, + "loss": 2.3685, + "step": 2897 + }, + { + "epoch": 0.3323013415892673, + "grad_norm": 0.250470056201604, + "learning_rate": 0.0007789324051570951, + "loss": 2.4857, + "step": 2898 + }, + { + "epoch": 0.33241600733860793, + "grad_norm": 0.25092723731338534, + "learning_rate": 0.0007787782715037387, + "loss": 2.5326, + "step": 2899 + }, + { + "epoch": 0.33253067308794865, + "grad_norm": 0.2597737706886953, + "learning_rate": 0.0007786240993982335, + "loss": 2.5676, + "step": 2900 + }, + { + "epoch": 0.3326453388372893, + "grad_norm": 0.2785390958471021, + "learning_rate": 0.0007784698888618449, + "loss": 2.5627, + "step": 2901 + }, + { + "epoch": 0.33276000458662996, + "grad_norm": 0.2470441938924738, + "learning_rate": 0.0007783156399158433, + "loss": 2.6115, + "step": 2902 + }, + { + "epoch": 0.3328746703359706, + "grad_norm": 0.25079928940062823, + "learning_rate": 0.0007781613525815043, + "loss": 2.4622, + "step": 2903 + }, + { + "epoch": 0.33298933608531134, + "grad_norm": 0.25997731792901885, + "learning_rate": 0.000778007026880109, + "loss": 2.544, + "step": 2904 + }, + { + "epoch": 0.333104001834652, + "grad_norm": 0.26888709544463824, + "learning_rate": 0.0007778526628329436, + "loss": 2.5282, + "step": 2905 + }, + { + "epoch": 0.33321866758399266, + "grad_norm": 0.2541356331798265, + "learning_rate": 0.0007776982604612996, + "loss": 2.6122, + "step": 2906 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.23375030562616259, + "learning_rate": 0.0007775438197864743, + "loss": 2.4158, + "step": 2907 + }, + { + "epoch": 0.33344799908267403, + "grad_norm": 0.24713240742152193, + "learning_rate": 0.0007773893408297692, + "loss": 2.4595, + "step": 2908 + }, + { + "epoch": 0.3335626648320147, + "grad_norm": 0.23394362750954667, + "learning_rate": 0.0007772348236124922, + "loss": 2.5929, + "step": 2909 + }, + { + "epoch": 0.33367733058135535, + "grad_norm": 0.22579589201461356, + "learning_rate": 0.0007770802681559558, + "loss": 2.4603, + "step": 2910 + }, + { + "epoch": 0.333791996330696, + "grad_norm": 0.22074776591913334, + "learning_rate": 0.0007769256744814781, + "loss": 2.5446, + "step": 2911 + }, + { + "epoch": 0.3339066620800367, + "grad_norm": 0.2575889827970145, + "learning_rate": 0.0007767710426103822, + "loss": 2.6346, + "step": 2912 + }, + { + "epoch": 0.3340213278293774, + "grad_norm": 0.24263932705014443, + "learning_rate": 0.0007766163725639967, + "loss": 2.5663, + "step": 2913 + }, + { + "epoch": 0.33413599357871804, + "grad_norm": 0.28500909641827976, + "learning_rate": 0.0007764616643636557, + "loss": 2.4592, + "step": 2914 + }, + { + "epoch": 0.3342506593280587, + "grad_norm": 0.276958263549589, + "learning_rate": 0.0007763069180306976, + "loss": 2.7844, + "step": 2915 + }, + { + "epoch": 0.33436532507739936, + "grad_norm": 0.26777068898211687, + "learning_rate": 0.0007761521335864672, + "loss": 2.5052, + "step": 2916 + }, + { + "epoch": 0.33447999082674007, + "grad_norm": 0.2654534216979825, + "learning_rate": 0.0007759973110523137, + "loss": 2.5486, + "step": 2917 + }, + { + "epoch": 0.33459465657608073, + "grad_norm": 0.24357040836165986, + "learning_rate": 0.0007758424504495925, + "loss": 2.5203, + "step": 2918 + }, + { + "epoch": 0.3347093223254214, + "grad_norm": 0.2519007125891757, + "learning_rate": 0.000775687551799663, + "loss": 2.6177, + "step": 2919 + }, + { + "epoch": 0.33482398807476205, + "grad_norm": 0.24111344799593543, + "learning_rate": 0.0007755326151238908, + "loss": 2.4844, + "step": 2920 + }, + { + "epoch": 0.33493865382410276, + "grad_norm": 0.2658739967907742, + "learning_rate": 0.0007753776404436466, + "loss": 2.7989, + "step": 2921 + }, + { + "epoch": 0.3350533195734434, + "grad_norm": 0.2755928882664132, + "learning_rate": 0.000775222627780306, + "loss": 2.6179, + "step": 2922 + }, + { + "epoch": 0.3351679853227841, + "grad_norm": 0.2689571432359613, + "learning_rate": 0.0007750675771552502, + "loss": 2.438, + "step": 2923 + }, + { + "epoch": 0.33528265107212474, + "grad_norm": 0.2789348927192148, + "learning_rate": 0.0007749124885898654, + "loss": 2.4638, + "step": 2924 + }, + { + "epoch": 0.33539731682146545, + "grad_norm": 0.26687965480680065, + "learning_rate": 0.0007747573621055431, + "loss": 2.4063, + "step": 2925 + }, + { + "epoch": 0.3355119825708061, + "grad_norm": 0.2530200537716329, + "learning_rate": 0.00077460219772368, + "loss": 2.5039, + "step": 2926 + }, + { + "epoch": 0.33562664832014677, + "grad_norm": 0.2507589320010678, + "learning_rate": 0.0007744469954656781, + "loss": 2.4869, + "step": 2927 + }, + { + "epoch": 0.33574131406948743, + "grad_norm": 0.24559990788066305, + "learning_rate": 0.0007742917553529447, + "loss": 2.4736, + "step": 2928 + }, + { + "epoch": 0.33585597981882814, + "grad_norm": 0.2804129707274445, + "learning_rate": 0.000774136477406892, + "loss": 2.5437, + "step": 2929 + }, + { + "epoch": 0.3359706455681688, + "grad_norm": 0.2666963945378615, + "learning_rate": 0.0007739811616489378, + "loss": 2.5724, + "step": 2930 + }, + { + "epoch": 0.33608531131750946, + "grad_norm": 0.24127313344187443, + "learning_rate": 0.0007738258081005049, + "loss": 2.5016, + "step": 2931 + }, + { + "epoch": 0.3361999770668501, + "grad_norm": 0.2540536743753185, + "learning_rate": 0.0007736704167830216, + "loss": 2.5301, + "step": 2932 + }, + { + "epoch": 0.3363146428161908, + "grad_norm": 0.258622248942266, + "learning_rate": 0.0007735149877179206, + "loss": 2.5181, + "step": 2933 + }, + { + "epoch": 0.3364293085655315, + "grad_norm": 0.27092779076837875, + "learning_rate": 0.0007733595209266408, + "loss": 2.5372, + "step": 2934 + }, + { + "epoch": 0.33654397431487215, + "grad_norm": 0.2643764416319443, + "learning_rate": 0.000773204016430626, + "loss": 2.4051, + "step": 2935 + }, + { + "epoch": 0.3366586400642128, + "grad_norm": 0.2436467722822796, + "learning_rate": 0.0007730484742513247, + "loss": 2.5717, + "step": 2936 + }, + { + "epoch": 0.33677330581355347, + "grad_norm": 0.2691065214152806, + "learning_rate": 0.0007728928944101912, + "loss": 2.6144, + "step": 2937 + }, + { + "epoch": 0.3368879715628942, + "grad_norm": 0.26660740097316216, + "learning_rate": 0.0007727372769286846, + "loss": 2.5468, + "step": 2938 + }, + { + "epoch": 0.33700263731223484, + "grad_norm": 0.24292099877583428, + "learning_rate": 0.0007725816218282697, + "loss": 2.665, + "step": 2939 + }, + { + "epoch": 0.3371173030615755, + "grad_norm": 0.2559916866013264, + "learning_rate": 0.0007724259291304159, + "loss": 2.6007, + "step": 2940 + }, + { + "epoch": 0.33723196881091616, + "grad_norm": 0.2658481145495126, + "learning_rate": 0.0007722701988565979, + "loss": 2.6243, + "step": 2941 + }, + { + "epoch": 0.3373466345602569, + "grad_norm": 0.24887045515215772, + "learning_rate": 0.0007721144310282961, + "loss": 2.6149, + "step": 2942 + }, + { + "epoch": 0.33746130030959753, + "grad_norm": 0.24105938825348663, + "learning_rate": 0.0007719586256669955, + "loss": 2.3526, + "step": 2943 + }, + { + "epoch": 0.3375759660589382, + "grad_norm": 0.253880925837504, + "learning_rate": 0.0007718027827941865, + "loss": 2.4064, + "step": 2944 + }, + { + "epoch": 0.33769063180827885, + "grad_norm": 0.25307183867774163, + "learning_rate": 0.0007716469024313645, + "loss": 2.4774, + "step": 2945 + }, + { + "epoch": 0.33780529755761957, + "grad_norm": 0.25485812836908117, + "learning_rate": 0.0007714909846000304, + "loss": 2.4907, + "step": 2946 + }, + { + "epoch": 0.3379199633069602, + "grad_norm": 0.24256073978103967, + "learning_rate": 0.0007713350293216903, + "loss": 2.5684, + "step": 2947 + }, + { + "epoch": 0.3380346290563009, + "grad_norm": 0.2663710823480498, + "learning_rate": 0.0007711790366178548, + "loss": 2.4919, + "step": 2948 + }, + { + "epoch": 0.33814929480564154, + "grad_norm": 0.24078042823645204, + "learning_rate": 0.0007710230065100404, + "loss": 2.577, + "step": 2949 + }, + { + "epoch": 0.3382639605549822, + "grad_norm": 0.27333028735444287, + "learning_rate": 0.0007708669390197683, + "loss": 2.5028, + "step": 2950 + }, + { + "epoch": 0.3383786263043229, + "grad_norm": 0.28078291062776034, + "learning_rate": 0.0007707108341685654, + "loss": 2.4797, + "step": 2951 + }, + { + "epoch": 0.3384932920536636, + "grad_norm": 0.26571437910235857, + "learning_rate": 0.000770554691977963, + "loss": 2.4723, + "step": 2952 + }, + { + "epoch": 0.33860795780300423, + "grad_norm": 0.2595220390992353, + "learning_rate": 0.0007703985124694981, + "loss": 2.6255, + "step": 2953 + }, + { + "epoch": 0.3387226235523449, + "grad_norm": 0.2661329018190098, + "learning_rate": 0.0007702422956647126, + "loss": 2.3811, + "step": 2954 + }, + { + "epoch": 0.3388372893016856, + "grad_norm": 0.24383355167946313, + "learning_rate": 0.0007700860415851538, + "loss": 2.4187, + "step": 2955 + }, + { + "epoch": 0.33895195505102627, + "grad_norm": 0.2549238872091199, + "learning_rate": 0.0007699297502523739, + "loss": 2.6562, + "step": 2956 + }, + { + "epoch": 0.3390666208003669, + "grad_norm": 0.25393262292628055, + "learning_rate": 0.0007697734216879302, + "loss": 2.5924, + "step": 2957 + }, + { + "epoch": 0.3391812865497076, + "grad_norm": 0.27513140320631274, + "learning_rate": 0.0007696170559133853, + "loss": 2.5301, + "step": 2958 + }, + { + "epoch": 0.3392959522990483, + "grad_norm": 0.2555350968067638, + "learning_rate": 0.000769460652950307, + "loss": 2.562, + "step": 2959 + }, + { + "epoch": 0.33941061804838896, + "grad_norm": 0.24454674157758655, + "learning_rate": 0.0007693042128202679, + "loss": 2.5924, + "step": 2960 + }, + { + "epoch": 0.3395252837977296, + "grad_norm": 0.28580779354578895, + "learning_rate": 0.0007691477355448461, + "loss": 2.6632, + "step": 2961 + }, + { + "epoch": 0.3396399495470703, + "grad_norm": 0.24727391212859587, + "learning_rate": 0.0007689912211456247, + "loss": 2.6048, + "step": 2962 + }, + { + "epoch": 0.339754615296411, + "grad_norm": 0.22307183281991919, + "learning_rate": 0.0007688346696441917, + "loss": 2.6055, + "step": 2963 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.2654186989873798, + "learning_rate": 0.0007686780810621406, + "loss": 2.5193, + "step": 2964 + }, + { + "epoch": 0.3399839467950923, + "grad_norm": 0.25401760459979744, + "learning_rate": 0.0007685214554210693, + "loss": 2.5376, + "step": 2965 + }, + { + "epoch": 0.34009861254443297, + "grad_norm": 0.24695574170682183, + "learning_rate": 0.0007683647927425821, + "loss": 2.5818, + "step": 2966 + }, + { + "epoch": 0.3402132782937736, + "grad_norm": 0.2475300189385152, + "learning_rate": 0.0007682080930482871, + "loss": 2.5006, + "step": 2967 + }, + { + "epoch": 0.34032794404311434, + "grad_norm": 0.2500474595151521, + "learning_rate": 0.0007680513563597982, + "loss": 2.5712, + "step": 2968 + }, + { + "epoch": 0.340442609792455, + "grad_norm": 0.26243643607979755, + "learning_rate": 0.0007678945826987343, + "loss": 2.5314, + "step": 2969 + }, + { + "epoch": 0.34055727554179566, + "grad_norm": 0.2450371235674348, + "learning_rate": 0.0007677377720867189, + "loss": 2.5319, + "step": 2970 + }, + { + "epoch": 0.3406719412911363, + "grad_norm": 0.2562268478490193, + "learning_rate": 0.0007675809245453818, + "loss": 2.5706, + "step": 2971 + }, + { + "epoch": 0.34078660704047703, + "grad_norm": 0.22696883125296485, + "learning_rate": 0.0007674240400963566, + "loss": 2.593, + "step": 2972 + }, + { + "epoch": 0.3409012727898177, + "grad_norm": 0.2519680268261953, + "learning_rate": 0.0007672671187612826, + "loss": 2.5785, + "step": 2973 + }, + { + "epoch": 0.34101593853915835, + "grad_norm": 0.2871220551382858, + "learning_rate": 0.0007671101605618041, + "loss": 2.4925, + "step": 2974 + }, + { + "epoch": 0.341130604288499, + "grad_norm": 0.27811607011491035, + "learning_rate": 0.0007669531655195705, + "loss": 2.642, + "step": 2975 + }, + { + "epoch": 0.3412452700378397, + "grad_norm": 0.265498662506417, + "learning_rate": 0.0007667961336562364, + "loss": 2.5985, + "step": 2976 + }, + { + "epoch": 0.3413599357871804, + "grad_norm": 0.24148283234660736, + "learning_rate": 0.0007666390649934612, + "loss": 2.5031, + "step": 2977 + }, + { + "epoch": 0.34147460153652104, + "grad_norm": 0.2663542440394786, + "learning_rate": 0.0007664819595529095, + "loss": 2.5747, + "step": 2978 + }, + { + "epoch": 0.3415892672858617, + "grad_norm": 0.24736492781882619, + "learning_rate": 0.0007663248173562513, + "loss": 2.4586, + "step": 2979 + }, + { + "epoch": 0.3417039330352024, + "grad_norm": 0.3000220046303528, + "learning_rate": 0.000766167638425161, + "loss": 2.6917, + "step": 2980 + }, + { + "epoch": 0.34181859878454307, + "grad_norm": 0.2514162263534708, + "learning_rate": 0.0007660104227813188, + "loss": 2.5457, + "step": 2981 + }, + { + "epoch": 0.34193326453388373, + "grad_norm": 0.24549564078531316, + "learning_rate": 0.0007658531704464092, + "loss": 2.4773, + "step": 2982 + }, + { + "epoch": 0.3420479302832244, + "grad_norm": 0.25615240120524024, + "learning_rate": 0.0007656958814421225, + "loss": 2.4922, + "step": 2983 + }, + { + "epoch": 0.34216259603256505, + "grad_norm": 0.38849471317410544, + "learning_rate": 0.0007655385557901534, + "loss": 2.4524, + "step": 2984 + }, + { + "epoch": 0.34227726178190576, + "grad_norm": 0.2664877711008682, + "learning_rate": 0.0007653811935122022, + "loss": 2.3778, + "step": 2985 + }, + { + "epoch": 0.3423919275312464, + "grad_norm": 0.2757237160329828, + "learning_rate": 0.0007652237946299741, + "loss": 2.4821, + "step": 2986 + }, + { + "epoch": 0.3425065932805871, + "grad_norm": 0.22540035763140626, + "learning_rate": 0.000765066359165179, + "loss": 2.3811, + "step": 2987 + }, + { + "epoch": 0.34262125902992774, + "grad_norm": 0.24671826215786202, + "learning_rate": 0.0007649088871395324, + "loss": 2.5621, + "step": 2988 + }, + { + "epoch": 0.34273592477926845, + "grad_norm": 0.2573451800082805, + "learning_rate": 0.0007647513785747545, + "loss": 2.4763, + "step": 2989 + }, + { + "epoch": 0.3428505905286091, + "grad_norm": 0.2624394179226302, + "learning_rate": 0.0007645938334925704, + "loss": 2.6558, + "step": 2990 + }, + { + "epoch": 0.34296525627794977, + "grad_norm": 0.2644386131436197, + "learning_rate": 0.0007644362519147106, + "loss": 2.6265, + "step": 2991 + }, + { + "epoch": 0.34307992202729043, + "grad_norm": 0.25490853028229826, + "learning_rate": 0.0007642786338629106, + "loss": 2.4722, + "step": 2992 + }, + { + "epoch": 0.34319458777663114, + "grad_norm": 0.2752647679231488, + "learning_rate": 0.0007641209793589105, + "loss": 2.4972, + "step": 2993 + }, + { + "epoch": 0.3433092535259718, + "grad_norm": 0.30573805652931685, + "learning_rate": 0.0007639632884244561, + "loss": 2.5842, + "step": 2994 + }, + { + "epoch": 0.34342391927531246, + "grad_norm": 0.28279249326815376, + "learning_rate": 0.0007638055610812974, + "loss": 2.564, + "step": 2995 + }, + { + "epoch": 0.3435385850246531, + "grad_norm": 0.25922334773156375, + "learning_rate": 0.0007636477973511903, + "loss": 2.5169, + "step": 2996 + }, + { + "epoch": 0.34365325077399383, + "grad_norm": 0.24460716789736825, + "learning_rate": 0.0007634899972558951, + "loss": 2.4617, + "step": 2997 + }, + { + "epoch": 0.3437679165233345, + "grad_norm": 0.25940833264309615, + "learning_rate": 0.0007633321608171774, + "loss": 2.519, + "step": 2998 + }, + { + "epoch": 0.34388258227267515, + "grad_norm": 0.2280352684905453, + "learning_rate": 0.0007631742880568075, + "loss": 2.6092, + "step": 2999 + }, + { + "epoch": 0.3439972480220158, + "grad_norm": 0.26456777333686393, + "learning_rate": 0.0007630163789965614, + "loss": 2.5571, + "step": 3000 + }, + { + "epoch": 0.34411191377135647, + "grad_norm": 0.24853708531929045, + "learning_rate": 0.0007628584336582192, + "loss": 2.6016, + "step": 3001 + }, + { + "epoch": 0.3442265795206972, + "grad_norm": 0.2724459120312537, + "learning_rate": 0.0007627004520635666, + "loss": 2.5635, + "step": 3002 + }, + { + "epoch": 0.34434124527003784, + "grad_norm": 0.250540945215181, + "learning_rate": 0.0007625424342343943, + "loss": 2.5375, + "step": 3003 + }, + { + "epoch": 0.3444559110193785, + "grad_norm": 0.25383084293124786, + "learning_rate": 0.0007623843801924977, + "loss": 2.4034, + "step": 3004 + }, + { + "epoch": 0.34457057676871916, + "grad_norm": 0.247241570571851, + "learning_rate": 0.0007622262899596772, + "loss": 2.5627, + "step": 3005 + }, + { + "epoch": 0.3446852425180599, + "grad_norm": 0.2518870979872506, + "learning_rate": 0.0007620681635577386, + "loss": 2.7534, + "step": 3006 + }, + { + "epoch": 0.34479990826740053, + "grad_norm": 0.31003686086764876, + "learning_rate": 0.0007619100010084923, + "loss": 2.5854, + "step": 3007 + }, + { + "epoch": 0.3449145740167412, + "grad_norm": 0.25970709221011673, + "learning_rate": 0.0007617518023337538, + "loss": 2.6026, + "step": 3008 + }, + { + "epoch": 0.34502923976608185, + "grad_norm": 0.2528730828336293, + "learning_rate": 0.0007615935675553436, + "loss": 2.5886, + "step": 3009 + }, + { + "epoch": 0.34514390551542257, + "grad_norm": 0.25850658491346484, + "learning_rate": 0.0007614352966950871, + "loss": 2.3926, + "step": 3010 + }, + { + "epoch": 0.3452585712647632, + "grad_norm": 0.24742109870443613, + "learning_rate": 0.000761276989774815, + "loss": 2.4262, + "step": 3011 + }, + { + "epoch": 0.3453732370141039, + "grad_norm": 0.26757324110838554, + "learning_rate": 0.0007611186468163625, + "loss": 2.4976, + "step": 3012 + }, + { + "epoch": 0.34548790276344454, + "grad_norm": 0.27267669632638913, + "learning_rate": 0.0007609602678415699, + "loss": 2.5956, + "step": 3013 + }, + { + "epoch": 0.34560256851278526, + "grad_norm": 0.24636746760184622, + "learning_rate": 0.0007608018528722829, + "loss": 2.5059, + "step": 3014 + }, + { + "epoch": 0.3457172342621259, + "grad_norm": 0.25430587581224134, + "learning_rate": 0.0007606434019303514, + "loss": 2.5686, + "step": 3015 + }, + { + "epoch": 0.3458319000114666, + "grad_norm": 0.2465568613738964, + "learning_rate": 0.0007604849150376311, + "loss": 2.5116, + "step": 3016 + }, + { + "epoch": 0.34594656576080723, + "grad_norm": 0.24350998852489353, + "learning_rate": 0.0007603263922159822, + "loss": 2.5396, + "step": 3017 + }, + { + "epoch": 0.3460612315101479, + "grad_norm": 0.2601429319251723, + "learning_rate": 0.0007601678334872695, + "loss": 2.4812, + "step": 3018 + }, + { + "epoch": 0.3461758972594886, + "grad_norm": 0.2509550834724402, + "learning_rate": 0.0007600092388733635, + "loss": 2.5338, + "step": 3019 + }, + { + "epoch": 0.34629056300882927, + "grad_norm": 0.24654957398410943, + "learning_rate": 0.0007598506083961394, + "loss": 2.5427, + "step": 3020 + }, + { + "epoch": 0.3464052287581699, + "grad_norm": 0.26482297319793857, + "learning_rate": 0.0007596919420774768, + "loss": 2.4525, + "step": 3021 + }, + { + "epoch": 0.3465198945075106, + "grad_norm": 0.23488542310921207, + "learning_rate": 0.0007595332399392611, + "loss": 2.6023, + "step": 3022 + }, + { + "epoch": 0.3466345602568513, + "grad_norm": 0.271611728371233, + "learning_rate": 0.0007593745020033822, + "loss": 2.427, + "step": 3023 + }, + { + "epoch": 0.34674922600619196, + "grad_norm": 0.26388422875120565, + "learning_rate": 0.0007592157282917347, + "loss": 2.6405, + "step": 3024 + }, + { + "epoch": 0.3468638917555326, + "grad_norm": 0.2551837231129936, + "learning_rate": 0.0007590569188262186, + "loss": 2.4518, + "step": 3025 + }, + { + "epoch": 0.3469785575048733, + "grad_norm": 0.23604713964510807, + "learning_rate": 0.0007588980736287389, + "loss": 2.5128, + "step": 3026 + }, + { + "epoch": 0.347093223254214, + "grad_norm": 0.25082333103527926, + "learning_rate": 0.0007587391927212046, + "loss": 2.6474, + "step": 3027 + }, + { + "epoch": 0.34720788900355465, + "grad_norm": 0.2599287492152934, + "learning_rate": 0.0007585802761255309, + "loss": 2.483, + "step": 3028 + }, + { + "epoch": 0.3473225547528953, + "grad_norm": 0.24360808791440647, + "learning_rate": 0.0007584213238636372, + "loss": 2.5432, + "step": 3029 + }, + { + "epoch": 0.34743722050223597, + "grad_norm": 0.2581195754027987, + "learning_rate": 0.0007582623359574476, + "loss": 2.6509, + "step": 3030 + }, + { + "epoch": 0.3475518862515767, + "grad_norm": 0.24709694268970234, + "learning_rate": 0.0007581033124288918, + "loss": 2.4277, + "step": 3031 + }, + { + "epoch": 0.34766655200091734, + "grad_norm": 0.24540447347796454, + "learning_rate": 0.0007579442532999039, + "loss": 2.666, + "step": 3032 + }, + { + "epoch": 0.347781217750258, + "grad_norm": 0.2384026030227758, + "learning_rate": 0.0007577851585924231, + "loss": 2.5315, + "step": 3033 + }, + { + "epoch": 0.34789588349959866, + "grad_norm": 0.2604710275234728, + "learning_rate": 0.0007576260283283935, + "loss": 2.6304, + "step": 3034 + }, + { + "epoch": 0.3480105492489393, + "grad_norm": 0.28006978530364646, + "learning_rate": 0.0007574668625297641, + "loss": 2.5378, + "step": 3035 + }, + { + "epoch": 0.34812521499828003, + "grad_norm": 0.24665911761772866, + "learning_rate": 0.000757307661218489, + "loss": 2.4881, + "step": 3036 + }, + { + "epoch": 0.3482398807476207, + "grad_norm": 0.2586897352334971, + "learning_rate": 0.0007571484244165266, + "loss": 2.5092, + "step": 3037 + }, + { + "epoch": 0.34835454649696135, + "grad_norm": 0.25338860279966463, + "learning_rate": 0.0007569891521458405, + "loss": 2.539, + "step": 3038 + }, + { + "epoch": 0.348469212246302, + "grad_norm": 0.238887311418351, + "learning_rate": 0.0007568298444283999, + "loss": 2.5425, + "step": 3039 + }, + { + "epoch": 0.3485838779956427, + "grad_norm": 0.2710474392251138, + "learning_rate": 0.0007566705012861777, + "loss": 2.5659, + "step": 3040 + }, + { + "epoch": 0.3486985437449834, + "grad_norm": 0.2689009419089519, + "learning_rate": 0.0007565111227411524, + "loss": 2.5482, + "step": 3041 + }, + { + "epoch": 0.34881320949432404, + "grad_norm": 0.25264475299225364, + "learning_rate": 0.0007563517088153074, + "loss": 2.5167, + "step": 3042 + }, + { + "epoch": 0.3489278752436647, + "grad_norm": 0.2722166270629138, + "learning_rate": 0.0007561922595306305, + "loss": 2.4338, + "step": 3043 + }, + { + "epoch": 0.3490425409930054, + "grad_norm": 0.24868386124446248, + "learning_rate": 0.000756032774909115, + "loss": 2.65, + "step": 3044 + }, + { + "epoch": 0.34915720674234607, + "grad_norm": 0.2548102050097299, + "learning_rate": 0.0007558732549727586, + "loss": 2.6326, + "step": 3045 + }, + { + "epoch": 0.34927187249168673, + "grad_norm": 0.23867235509564047, + "learning_rate": 0.0007557136997435641, + "loss": 2.6259, + "step": 3046 + }, + { + "epoch": 0.3493865382410274, + "grad_norm": 0.24935824921190605, + "learning_rate": 0.000755554109243539, + "loss": 2.4261, + "step": 3047 + }, + { + "epoch": 0.3495012039903681, + "grad_norm": 0.294644462224729, + "learning_rate": 0.0007553944834946958, + "loss": 2.5452, + "step": 3048 + }, + { + "epoch": 0.34961586973970876, + "grad_norm": 0.26575038988103444, + "learning_rate": 0.0007552348225190519, + "loss": 2.5476, + "step": 3049 + }, + { + "epoch": 0.3497305354890494, + "grad_norm": 0.2497505957206534, + "learning_rate": 0.0007550751263386295, + "loss": 2.4672, + "step": 3050 + }, + { + "epoch": 0.3498452012383901, + "grad_norm": 0.2925600416503518, + "learning_rate": 0.0007549153949754557, + "loss": 2.4983, + "step": 3051 + }, + { + "epoch": 0.34995986698773074, + "grad_norm": 0.28910989034936235, + "learning_rate": 0.0007547556284515621, + "loss": 2.425, + "step": 3052 + }, + { + "epoch": 0.35007453273707145, + "grad_norm": 0.24001250617812137, + "learning_rate": 0.0007545958267889856, + "loss": 2.612, + "step": 3053 + }, + { + "epoch": 0.3501891984864121, + "grad_norm": 0.2482439816866991, + "learning_rate": 0.0007544359900097681, + "loss": 2.5203, + "step": 3054 + }, + { + "epoch": 0.35030386423575277, + "grad_norm": 0.2735427601812003, + "learning_rate": 0.0007542761181359556, + "loss": 2.524, + "step": 3055 + }, + { + "epoch": 0.35041852998509343, + "grad_norm": 0.24853383551240452, + "learning_rate": 0.0007541162111895994, + "loss": 2.5842, + "step": 3056 + }, + { + "epoch": 0.35053319573443414, + "grad_norm": 0.22286100485411198, + "learning_rate": 0.000753956269192756, + "loss": 2.4441, + "step": 3057 + }, + { + "epoch": 0.3506478614837748, + "grad_norm": 0.2541443833706939, + "learning_rate": 0.0007537962921674861, + "loss": 2.572, + "step": 3058 + }, + { + "epoch": 0.35076252723311546, + "grad_norm": 0.24377239883658305, + "learning_rate": 0.0007536362801358554, + "loss": 2.5892, + "step": 3059 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.26127626455922626, + "learning_rate": 0.0007534762331199345, + "loss": 2.6009, + "step": 3060 + }, + { + "epoch": 0.35099185873179684, + "grad_norm": 0.2756143796026638, + "learning_rate": 0.0007533161511417992, + "loss": 2.549, + "step": 3061 + }, + { + "epoch": 0.3511065244811375, + "grad_norm": 0.21981769867096165, + "learning_rate": 0.0007531560342235293, + "loss": 2.4849, + "step": 3062 + }, + { + "epoch": 0.35122119023047815, + "grad_norm": 0.2553058340087511, + "learning_rate": 0.0007529958823872101, + "loss": 2.5306, + "step": 3063 + }, + { + "epoch": 0.3513358559798188, + "grad_norm": 0.23016603320069604, + "learning_rate": 0.0007528356956549316, + "loss": 2.4898, + "step": 3064 + }, + { + "epoch": 0.3514505217291595, + "grad_norm": 0.2512041926921274, + "learning_rate": 0.0007526754740487881, + "loss": 2.5357, + "step": 3065 + }, + { + "epoch": 0.3515651874785002, + "grad_norm": 0.26328016476043703, + "learning_rate": 0.0007525152175908796, + "loss": 2.4987, + "step": 3066 + }, + { + "epoch": 0.35167985322784084, + "grad_norm": 0.26189030475327185, + "learning_rate": 0.0007523549263033103, + "loss": 2.5318, + "step": 3067 + }, + { + "epoch": 0.3517945189771815, + "grad_norm": 0.2583804657276663, + "learning_rate": 0.000752194600208189, + "loss": 2.5268, + "step": 3068 + }, + { + "epoch": 0.35190918472652216, + "grad_norm": 0.27073690091360375, + "learning_rate": 0.00075203423932763, + "loss": 2.3489, + "step": 3069 + }, + { + "epoch": 0.3520238504758629, + "grad_norm": 0.25978555372581147, + "learning_rate": 0.0007518738436837518, + "loss": 2.5029, + "step": 3070 + }, + { + "epoch": 0.35213851622520354, + "grad_norm": 0.2560339117060919, + "learning_rate": 0.000751713413298678, + "loss": 2.6239, + "step": 3071 + }, + { + "epoch": 0.3522531819745442, + "grad_norm": 0.2606779990326901, + "learning_rate": 0.0007515529481945372, + "loss": 2.5985, + "step": 3072 + }, + { + "epoch": 0.35236784772388485, + "grad_norm": 0.23552578199006022, + "learning_rate": 0.000751392448393462, + "loss": 2.551, + "step": 3073 + }, + { + "epoch": 0.35248251347322557, + "grad_norm": 0.2272784036750427, + "learning_rate": 0.0007512319139175905, + "loss": 2.367, + "step": 3074 + }, + { + "epoch": 0.3525971792225662, + "grad_norm": 0.2649427614841162, + "learning_rate": 0.0007510713447890653, + "loss": 2.5089, + "step": 3075 + }, + { + "epoch": 0.3527118449719069, + "grad_norm": 0.2462458394809935, + "learning_rate": 0.0007509107410300342, + "loss": 2.5211, + "step": 3076 + }, + { + "epoch": 0.35282651072124754, + "grad_norm": 0.2718851883736064, + "learning_rate": 0.0007507501026626491, + "loss": 2.3798, + "step": 3077 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.244551511956551, + "learning_rate": 0.000750589429709067, + "loss": 2.459, + "step": 3078 + }, + { + "epoch": 0.3530558422199289, + "grad_norm": 0.24192850420680795, + "learning_rate": 0.0007504287221914499, + "loss": 2.5986, + "step": 3079 + }, + { + "epoch": 0.3531705079692696, + "grad_norm": 0.22681378685831374, + "learning_rate": 0.0007502679801319641, + "loss": 2.5195, + "step": 3080 + }, + { + "epoch": 0.35328517371861023, + "grad_norm": 0.26785347958701206, + "learning_rate": 0.0007501072035527807, + "loss": 2.6528, + "step": 3081 + }, + { + "epoch": 0.35339983946795095, + "grad_norm": 0.2433764983441952, + "learning_rate": 0.0007499463924760764, + "loss": 2.5756, + "step": 3082 + }, + { + "epoch": 0.3535145052172916, + "grad_norm": 0.24064123909052318, + "learning_rate": 0.0007497855469240316, + "loss": 2.6152, + "step": 3083 + }, + { + "epoch": 0.35362917096663227, + "grad_norm": 0.24396816597769877, + "learning_rate": 0.0007496246669188319, + "loss": 2.5216, + "step": 3084 + }, + { + "epoch": 0.3537438367159729, + "grad_norm": 0.24642070592575777, + "learning_rate": 0.0007494637524826677, + "loss": 2.4653, + "step": 3085 + }, + { + "epoch": 0.3538585024653136, + "grad_norm": 0.23056413802132453, + "learning_rate": 0.000749302803637734, + "loss": 2.569, + "step": 3086 + }, + { + "epoch": 0.3539731682146543, + "grad_norm": 0.2718753377162669, + "learning_rate": 0.0007491418204062307, + "loss": 2.6735, + "step": 3087 + }, + { + "epoch": 0.35408783396399496, + "grad_norm": 0.23621785272136475, + "learning_rate": 0.0007489808028103622, + "loss": 2.4658, + "step": 3088 + }, + { + "epoch": 0.3542024997133356, + "grad_norm": 0.2441662715414475, + "learning_rate": 0.0007488197508723383, + "loss": 2.4597, + "step": 3089 + }, + { + "epoch": 0.3543171654626763, + "grad_norm": 0.24450045183394317, + "learning_rate": 0.0007486586646143725, + "loss": 2.5648, + "step": 3090 + }, + { + "epoch": 0.354431831212017, + "grad_norm": 0.26829804338749536, + "learning_rate": 0.0007484975440586838, + "loss": 2.535, + "step": 3091 + }, + { + "epoch": 0.35454649696135765, + "grad_norm": 0.26462531583581067, + "learning_rate": 0.0007483363892274958, + "loss": 2.5631, + "step": 3092 + }, + { + "epoch": 0.3546611627106983, + "grad_norm": 0.2620082907421261, + "learning_rate": 0.0007481752001430364, + "loss": 2.6509, + "step": 3093 + }, + { + "epoch": 0.35477582846003897, + "grad_norm": 0.25095729846041204, + "learning_rate": 0.000748013976827539, + "loss": 2.5392, + "step": 3094 + }, + { + "epoch": 0.3548904942093797, + "grad_norm": 0.24126590301942513, + "learning_rate": 0.0007478527193032409, + "loss": 2.6683, + "step": 3095 + }, + { + "epoch": 0.35500515995872034, + "grad_norm": 0.27080000600568355, + "learning_rate": 0.0007476914275923848, + "loss": 2.5882, + "step": 3096 + }, + { + "epoch": 0.355119825708061, + "grad_norm": 0.2567890732043505, + "learning_rate": 0.0007475301017172177, + "loss": 2.605, + "step": 3097 + }, + { + "epoch": 0.35523449145740166, + "grad_norm": 0.2701327770313484, + "learning_rate": 0.0007473687416999913, + "loss": 2.6006, + "step": 3098 + }, + { + "epoch": 0.35534915720674237, + "grad_norm": 0.25779980883817855, + "learning_rate": 0.0007472073475629624, + "loss": 2.5761, + "step": 3099 + }, + { + "epoch": 0.35546382295608303, + "grad_norm": 0.26403933252099265, + "learning_rate": 0.000747045919328392, + "loss": 2.4724, + "step": 3100 + }, + { + "epoch": 0.3555784887054237, + "grad_norm": 0.2335864184272574, + "learning_rate": 0.0007468844570185462, + "loss": 2.5974, + "step": 3101 + }, + { + "epoch": 0.35569315445476435, + "grad_norm": 0.24983549330983407, + "learning_rate": 0.0007467229606556955, + "loss": 2.5246, + "step": 3102 + }, + { + "epoch": 0.355807820204105, + "grad_norm": 0.28539044203481995, + "learning_rate": 0.0007465614302621155, + "loss": 2.7229, + "step": 3103 + }, + { + "epoch": 0.3559224859534457, + "grad_norm": 0.23687183625235972, + "learning_rate": 0.0007463998658600861, + "loss": 2.5572, + "step": 3104 + }, + { + "epoch": 0.3560371517027864, + "grad_norm": 0.26135377648656194, + "learning_rate": 0.0007462382674718919, + "loss": 2.5261, + "step": 3105 + }, + { + "epoch": 0.35615181745212704, + "grad_norm": 0.2312464893201061, + "learning_rate": 0.0007460766351198225, + "loss": 2.5296, + "step": 3106 + }, + { + "epoch": 0.3562664832014677, + "grad_norm": 0.2602927730487249, + "learning_rate": 0.0007459149688261719, + "loss": 2.4481, + "step": 3107 + }, + { + "epoch": 0.3563811489508084, + "grad_norm": 0.2242617653451122, + "learning_rate": 0.0007457532686132389, + "loss": 2.5021, + "step": 3108 + }, + { + "epoch": 0.35649581470014907, + "grad_norm": 0.2542063379751769, + "learning_rate": 0.0007455915345033271, + "loss": 2.5574, + "step": 3109 + }, + { + "epoch": 0.35661048044948973, + "grad_norm": 0.2795169365549323, + "learning_rate": 0.0007454297665187442, + "loss": 2.5236, + "step": 3110 + }, + { + "epoch": 0.3567251461988304, + "grad_norm": 0.25933121948296556, + "learning_rate": 0.0007452679646818037, + "loss": 2.5367, + "step": 3111 + }, + { + "epoch": 0.3568398119481711, + "grad_norm": 0.23722468774965372, + "learning_rate": 0.0007451061290148224, + "loss": 2.3433, + "step": 3112 + }, + { + "epoch": 0.35695447769751176, + "grad_norm": 0.24217836537842158, + "learning_rate": 0.000744944259540123, + "loss": 2.51, + "step": 3113 + }, + { + "epoch": 0.3570691434468524, + "grad_norm": 0.25039696908830755, + "learning_rate": 0.000744782356280032, + "loss": 2.4594, + "step": 3114 + }, + { + "epoch": 0.3571838091961931, + "grad_norm": 0.2428749110976045, + "learning_rate": 0.0007446204192568807, + "loss": 2.5136, + "step": 3115 + }, + { + "epoch": 0.3572984749455338, + "grad_norm": 0.2488758501385027, + "learning_rate": 0.0007444584484930057, + "loss": 2.482, + "step": 3116 + }, + { + "epoch": 0.35741314069487445, + "grad_norm": 0.261497838893265, + "learning_rate": 0.0007442964440107476, + "loss": 2.5315, + "step": 3117 + }, + { + "epoch": 0.3575278064442151, + "grad_norm": 0.26693339938227467, + "learning_rate": 0.0007441344058324515, + "loss": 2.4584, + "step": 3118 + }, + { + "epoch": 0.35764247219355577, + "grad_norm": 0.2561420382489682, + "learning_rate": 0.0007439723339804679, + "loss": 2.6031, + "step": 3119 + }, + { + "epoch": 0.35775713794289643, + "grad_norm": 0.2562933807935695, + "learning_rate": 0.0007438102284771513, + "loss": 2.5643, + "step": 3120 + }, + { + "epoch": 0.35787180369223714, + "grad_norm": 0.2536944027439819, + "learning_rate": 0.0007436480893448611, + "loss": 2.3698, + "step": 3121 + }, + { + "epoch": 0.3579864694415778, + "grad_norm": 0.2786183871586788, + "learning_rate": 0.0007434859166059616, + "loss": 2.5176, + "step": 3122 + }, + { + "epoch": 0.35810113519091846, + "grad_norm": 0.27828097414799635, + "learning_rate": 0.0007433237102828209, + "loss": 2.4585, + "step": 3123 + }, + { + "epoch": 0.3582158009402591, + "grad_norm": 0.2772068944608024, + "learning_rate": 0.0007431614703978125, + "loss": 2.5602, + "step": 3124 + }, + { + "epoch": 0.35833046668959984, + "grad_norm": 0.23336193353175308, + "learning_rate": 0.0007429991969733144, + "loss": 2.6417, + "step": 3125 + }, + { + "epoch": 0.3584451324389405, + "grad_norm": 0.2511228245756997, + "learning_rate": 0.0007428368900317092, + "loss": 2.4885, + "step": 3126 + }, + { + "epoch": 0.35855979818828115, + "grad_norm": 0.2583519132978108, + "learning_rate": 0.0007426745495953838, + "loss": 2.6091, + "step": 3127 + }, + { + "epoch": 0.3586744639376218, + "grad_norm": 0.2524640976963248, + "learning_rate": 0.00074251217568673, + "loss": 2.5446, + "step": 3128 + }, + { + "epoch": 0.3587891296869625, + "grad_norm": 0.23365453582426485, + "learning_rate": 0.0007423497683281444, + "loss": 2.4352, + "step": 3129 + }, + { + "epoch": 0.3589037954363032, + "grad_norm": 0.25032972853542, + "learning_rate": 0.0007421873275420277, + "loss": 2.5963, + "step": 3130 + }, + { + "epoch": 0.35901846118564384, + "grad_norm": 0.26103965283631186, + "learning_rate": 0.0007420248533507858, + "loss": 2.6379, + "step": 3131 + }, + { + "epoch": 0.3591331269349845, + "grad_norm": 0.291924118554959, + "learning_rate": 0.000741862345776829, + "loss": 2.521, + "step": 3132 + }, + { + "epoch": 0.3592477926843252, + "grad_norm": 0.2714307171327002, + "learning_rate": 0.0007416998048425716, + "loss": 2.5741, + "step": 3133 + }, + { + "epoch": 0.3593624584336659, + "grad_norm": 0.25127692416718456, + "learning_rate": 0.0007415372305704334, + "loss": 2.4767, + "step": 3134 + }, + { + "epoch": 0.35947712418300654, + "grad_norm": 0.271734111623691, + "learning_rate": 0.0007413746229828384, + "loss": 2.5986, + "step": 3135 + }, + { + "epoch": 0.3595917899323472, + "grad_norm": 0.2533711797925703, + "learning_rate": 0.0007412119821022153, + "loss": 2.4182, + "step": 3136 + }, + { + "epoch": 0.35970645568168785, + "grad_norm": 0.24779051516424047, + "learning_rate": 0.000741049307950997, + "loss": 2.5297, + "step": 3137 + }, + { + "epoch": 0.35982112143102857, + "grad_norm": 0.270192011798646, + "learning_rate": 0.0007408866005516215, + "loss": 2.6237, + "step": 3138 + }, + { + "epoch": 0.3599357871803692, + "grad_norm": 0.2406907088169097, + "learning_rate": 0.0007407238599265313, + "loss": 2.3516, + "step": 3139 + }, + { + "epoch": 0.3600504529297099, + "grad_norm": 0.2677697900180917, + "learning_rate": 0.0007405610860981731, + "loss": 2.6636, + "step": 3140 + }, + { + "epoch": 0.36016511867905054, + "grad_norm": 0.26232909617419553, + "learning_rate": 0.0007403982790889987, + "loss": 2.4711, + "step": 3141 + }, + { + "epoch": 0.36027978442839126, + "grad_norm": 0.26266834866067457, + "learning_rate": 0.0007402354389214642, + "loss": 2.562, + "step": 3142 + }, + { + "epoch": 0.3603944501777319, + "grad_norm": 0.2617115661948789, + "learning_rate": 0.0007400725656180298, + "loss": 2.518, + "step": 3143 + }, + { + "epoch": 0.3605091159270726, + "grad_norm": 0.24209658973999842, + "learning_rate": 0.0007399096592011616, + "loss": 2.5176, + "step": 3144 + }, + { + "epoch": 0.36062378167641324, + "grad_norm": 0.24993664326035045, + "learning_rate": 0.0007397467196933287, + "loss": 2.5033, + "step": 3145 + }, + { + "epoch": 0.36073844742575395, + "grad_norm": 0.24526713087173183, + "learning_rate": 0.000739583747117006, + "loss": 2.5369, + "step": 3146 + }, + { + "epoch": 0.3608531131750946, + "grad_norm": 0.22896931674997084, + "learning_rate": 0.0007394207414946723, + "loss": 2.4551, + "step": 3147 + }, + { + "epoch": 0.36096777892443527, + "grad_norm": 0.2774416856452001, + "learning_rate": 0.0007392577028488109, + "loss": 2.4096, + "step": 3148 + }, + { + "epoch": 0.3610824446737759, + "grad_norm": 0.24102181101106812, + "learning_rate": 0.0007390946312019102, + "loss": 2.5575, + "step": 3149 + }, + { + "epoch": 0.36119711042311664, + "grad_norm": 0.242008286580097, + "learning_rate": 0.0007389315265764626, + "loss": 2.5322, + "step": 3150 + }, + { + "epoch": 0.3613117761724573, + "grad_norm": 0.25490807445818875, + "learning_rate": 0.0007387683889949655, + "loss": 2.5105, + "step": 3151 + }, + { + "epoch": 0.36142644192179796, + "grad_norm": 0.22616711936224826, + "learning_rate": 0.0007386052184799204, + "loss": 2.4694, + "step": 3152 + }, + { + "epoch": 0.3615411076711386, + "grad_norm": 0.261212111431264, + "learning_rate": 0.0007384420150538336, + "loss": 2.5564, + "step": 3153 + }, + { + "epoch": 0.3616557734204793, + "grad_norm": 0.2339104101510996, + "learning_rate": 0.000738278778739216, + "loss": 2.5533, + "step": 3154 + }, + { + "epoch": 0.36177043916982, + "grad_norm": 0.23432215359359915, + "learning_rate": 0.0007381155095585827, + "loss": 2.5362, + "step": 3155 + }, + { + "epoch": 0.36188510491916065, + "grad_norm": 0.24958540801299817, + "learning_rate": 0.000737952207534454, + "loss": 2.5934, + "step": 3156 + }, + { + "epoch": 0.3619997706685013, + "grad_norm": 0.2586432324025952, + "learning_rate": 0.000737788872689354, + "loss": 2.525, + "step": 3157 + }, + { + "epoch": 0.36211443641784197, + "grad_norm": 0.25110139790699343, + "learning_rate": 0.0007376255050458116, + "loss": 2.5845, + "step": 3158 + }, + { + "epoch": 0.3622291021671827, + "grad_norm": 0.23297146131186758, + "learning_rate": 0.0007374621046263603, + "loss": 2.4288, + "step": 3159 + }, + { + "epoch": 0.36234376791652334, + "grad_norm": 0.2413993292073154, + "learning_rate": 0.0007372986714535381, + "loss": 2.5158, + "step": 3160 + }, + { + "epoch": 0.362458433665864, + "grad_norm": 0.26260594317519703, + "learning_rate": 0.0007371352055498876, + "loss": 2.5239, + "step": 3161 + }, + { + "epoch": 0.36257309941520466, + "grad_norm": 0.26102502591721705, + "learning_rate": 0.0007369717069379558, + "loss": 2.4975, + "step": 3162 + }, + { + "epoch": 0.3626877651645454, + "grad_norm": 0.2636504912807116, + "learning_rate": 0.0007368081756402939, + "loss": 2.6207, + "step": 3163 + }, + { + "epoch": 0.36280243091388603, + "grad_norm": 0.25344038533017216, + "learning_rate": 0.0007366446116794583, + "loss": 2.6166, + "step": 3164 + }, + { + "epoch": 0.3629170966632267, + "grad_norm": 0.2525686570130536, + "learning_rate": 0.0007364810150780091, + "loss": 2.5102, + "step": 3165 + }, + { + "epoch": 0.36303176241256735, + "grad_norm": 0.25523542114331427, + "learning_rate": 0.0007363173858585119, + "loss": 2.6065, + "step": 3166 + }, + { + "epoch": 0.36314642816190806, + "grad_norm": 0.22510147332865776, + "learning_rate": 0.000736153724043536, + "loss": 2.5973, + "step": 3167 + }, + { + "epoch": 0.3632610939112487, + "grad_norm": 0.2705255182844913, + "learning_rate": 0.000735990029655655, + "loss": 2.4549, + "step": 3168 + }, + { + "epoch": 0.3633757596605894, + "grad_norm": 0.263845768160062, + "learning_rate": 0.0007358263027174481, + "loss": 2.4938, + "step": 3169 + }, + { + "epoch": 0.36349042540993004, + "grad_norm": 0.22700588945144862, + "learning_rate": 0.0007356625432514979, + "loss": 2.4462, + "step": 3170 + }, + { + "epoch": 0.3636050911592707, + "grad_norm": 0.23398545061393705, + "learning_rate": 0.000735498751280392, + "loss": 2.4971, + "step": 3171 + }, + { + "epoch": 0.3637197569086114, + "grad_norm": 0.24765427397815926, + "learning_rate": 0.0007353349268267224, + "loss": 2.547, + "step": 3172 + }, + { + "epoch": 0.3638344226579521, + "grad_norm": 0.25692269905414156, + "learning_rate": 0.0007351710699130856, + "loss": 2.6207, + "step": 3173 + }, + { + "epoch": 0.36394908840729273, + "grad_norm": 0.26636382433693206, + "learning_rate": 0.0007350071805620823, + "loss": 2.5768, + "step": 3174 + }, + { + "epoch": 0.3640637541566334, + "grad_norm": 0.24393471039972586, + "learning_rate": 0.000734843258796318, + "loss": 2.5278, + "step": 3175 + }, + { + "epoch": 0.3641784199059741, + "grad_norm": 0.2575181026348928, + "learning_rate": 0.0007346793046384031, + "loss": 2.4372, + "step": 3176 + }, + { + "epoch": 0.36429308565531476, + "grad_norm": 0.2566293234573205, + "learning_rate": 0.0007345153181109511, + "loss": 2.5635, + "step": 3177 + }, + { + "epoch": 0.3644077514046554, + "grad_norm": 0.2632488479659653, + "learning_rate": 0.0007343512992365815, + "loss": 2.5153, + "step": 3178 + }, + { + "epoch": 0.3645224171539961, + "grad_norm": 0.2632026448900477, + "learning_rate": 0.0007341872480379172, + "loss": 2.5599, + "step": 3179 + }, + { + "epoch": 0.3646370829033368, + "grad_norm": 0.2502691935441752, + "learning_rate": 0.0007340231645375861, + "loss": 2.4435, + "step": 3180 + }, + { + "epoch": 0.36475174865267745, + "grad_norm": 0.2722110248376904, + "learning_rate": 0.0007338590487582202, + "loss": 2.5919, + "step": 3181 + }, + { + "epoch": 0.3648664144020181, + "grad_norm": 0.2554796875525832, + "learning_rate": 0.0007336949007224565, + "loss": 2.5571, + "step": 3182 + }, + { + "epoch": 0.36498108015135877, + "grad_norm": 0.22032470629196668, + "learning_rate": 0.0007335307204529356, + "loss": 2.5323, + "step": 3183 + }, + { + "epoch": 0.3650957459006995, + "grad_norm": 0.24047608808170987, + "learning_rate": 0.0007333665079723035, + "loss": 2.5185, + "step": 3184 + }, + { + "epoch": 0.36521041165004015, + "grad_norm": 0.26194170074787787, + "learning_rate": 0.00073320226330321, + "loss": 2.5665, + "step": 3185 + }, + { + "epoch": 0.3653250773993808, + "grad_norm": 0.24549036575306524, + "learning_rate": 0.0007330379864683096, + "loss": 2.4265, + "step": 3186 + }, + { + "epoch": 0.36543974314872146, + "grad_norm": 0.24583555473668625, + "learning_rate": 0.0007328736774902609, + "loss": 2.6855, + "step": 3187 + }, + { + "epoch": 0.3655544088980621, + "grad_norm": 0.2523588950172083, + "learning_rate": 0.0007327093363917274, + "loss": 2.4945, + "step": 3188 + }, + { + "epoch": 0.36566907464740284, + "grad_norm": 0.267660448077342, + "learning_rate": 0.0007325449631953769, + "loss": 2.4409, + "step": 3189 + }, + { + "epoch": 0.3657837403967435, + "grad_norm": 0.24664680897560612, + "learning_rate": 0.0007323805579238812, + "loss": 2.448, + "step": 3190 + }, + { + "epoch": 0.36589840614608415, + "grad_norm": 0.2583086638800528, + "learning_rate": 0.0007322161205999173, + "loss": 2.5507, + "step": 3191 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.2773166536605837, + "learning_rate": 0.000732051651246166, + "loss": 2.461, + "step": 3192 + }, + { + "epoch": 0.3661277376447655, + "grad_norm": 0.25899012348965544, + "learning_rate": 0.0007318871498853126, + "loss": 2.5581, + "step": 3193 + }, + { + "epoch": 0.3662424033941062, + "grad_norm": 0.24444413567267353, + "learning_rate": 0.0007317226165400473, + "loss": 2.6294, + "step": 3194 + }, + { + "epoch": 0.36635706914344685, + "grad_norm": 0.26457645257840207, + "learning_rate": 0.0007315580512330638, + "loss": 2.3897, + "step": 3195 + }, + { + "epoch": 0.3664717348927875, + "grad_norm": 0.2646956862078107, + "learning_rate": 0.000731393453987061, + "loss": 2.4889, + "step": 3196 + }, + { + "epoch": 0.3665864006421282, + "grad_norm": 0.22422556577523256, + "learning_rate": 0.0007312288248247423, + "loss": 2.4349, + "step": 3197 + }, + { + "epoch": 0.3667010663914689, + "grad_norm": 0.26564650084219754, + "learning_rate": 0.0007310641637688147, + "loss": 2.5783, + "step": 3198 + }, + { + "epoch": 0.36681573214080954, + "grad_norm": 0.24384006103754421, + "learning_rate": 0.0007308994708419901, + "loss": 2.4479, + "step": 3199 + }, + { + "epoch": 0.3669303978901502, + "grad_norm": 0.2415774468850789, + "learning_rate": 0.0007307347460669849, + "loss": 2.5699, + "step": 3200 + }, + { + "epoch": 0.3670450636394909, + "grad_norm": 0.26267245780875026, + "learning_rate": 0.0007305699894665196, + "loss": 2.5368, + "step": 3201 + }, + { + "epoch": 0.36715972938883157, + "grad_norm": 0.2801556207979414, + "learning_rate": 0.0007304052010633195, + "loss": 2.4933, + "step": 3202 + }, + { + "epoch": 0.3672743951381722, + "grad_norm": 0.2666971938531318, + "learning_rate": 0.0007302403808801136, + "loss": 2.5245, + "step": 3203 + }, + { + "epoch": 0.3673890608875129, + "grad_norm": 0.25236476566392807, + "learning_rate": 0.0007300755289396362, + "loss": 2.4845, + "step": 3204 + }, + { + "epoch": 0.36750372663685354, + "grad_norm": 0.2623797754877452, + "learning_rate": 0.000729910645264625, + "loss": 2.6673, + "step": 3205 + }, + { + "epoch": 0.36761839238619426, + "grad_norm": 0.24395328534456118, + "learning_rate": 0.000729745729877823, + "loss": 2.5293, + "step": 3206 + }, + { + "epoch": 0.3677330581355349, + "grad_norm": 0.25853618255640115, + "learning_rate": 0.0007295807828019767, + "loss": 2.6044, + "step": 3207 + }, + { + "epoch": 0.3678477238848756, + "grad_norm": 0.23019980342081223, + "learning_rate": 0.0007294158040598378, + "loss": 2.4894, + "step": 3208 + }, + { + "epoch": 0.36796238963421624, + "grad_norm": 0.25609561532275954, + "learning_rate": 0.0007292507936741616, + "loss": 2.5574, + "step": 3209 + }, + { + "epoch": 0.36807705538355695, + "grad_norm": 0.29422786273248425, + "learning_rate": 0.0007290857516677084, + "loss": 2.5858, + "step": 3210 + }, + { + "epoch": 0.3681917211328976, + "grad_norm": 0.2740218549789989, + "learning_rate": 0.0007289206780632427, + "loss": 2.4877, + "step": 3211 + }, + { + "epoch": 0.36830638688223827, + "grad_norm": 0.2675330483448109, + "learning_rate": 0.0007287555728835329, + "loss": 2.5066, + "step": 3212 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.25298404141476627, + "learning_rate": 0.0007285904361513522, + "loss": 2.5734, + "step": 3213 + }, + { + "epoch": 0.36853571838091964, + "grad_norm": 0.23717492208207117, + "learning_rate": 0.0007284252678894785, + "loss": 2.5246, + "step": 3214 + }, + { + "epoch": 0.3686503841302603, + "grad_norm": 0.2464028436615288, + "learning_rate": 0.0007282600681206929, + "loss": 2.5411, + "step": 3215 + }, + { + "epoch": 0.36876504987960096, + "grad_norm": 0.22870968620181448, + "learning_rate": 0.0007280948368677822, + "loss": 2.4523, + "step": 3216 + }, + { + "epoch": 0.3688797156289416, + "grad_norm": 0.24914657658671732, + "learning_rate": 0.0007279295741535367, + "loss": 2.5281, + "step": 3217 + }, + { + "epoch": 0.36899438137828233, + "grad_norm": 0.2345395972347272, + "learning_rate": 0.0007277642800007509, + "loss": 2.4514, + "step": 3218 + }, + { + "epoch": 0.369109047127623, + "grad_norm": 0.24618260636673986, + "learning_rate": 0.0007275989544322244, + "loss": 2.5058, + "step": 3219 + }, + { + "epoch": 0.36922371287696365, + "grad_norm": 0.246210083517168, + "learning_rate": 0.0007274335974707606, + "loss": 2.5728, + "step": 3220 + }, + { + "epoch": 0.3693383786263043, + "grad_norm": 0.27387977449719164, + "learning_rate": 0.0007272682091391671, + "loss": 2.4722, + "step": 3221 + }, + { + "epoch": 0.36945304437564497, + "grad_norm": 0.2668782157906226, + "learning_rate": 0.0007271027894602567, + "loss": 2.4624, + "step": 3222 + }, + { + "epoch": 0.3695677101249857, + "grad_norm": 0.24270577338292615, + "learning_rate": 0.0007269373384568451, + "loss": 2.3549, + "step": 3223 + }, + { + "epoch": 0.36968237587432634, + "grad_norm": 0.4314882449595344, + "learning_rate": 0.0007267718561517535, + "loss": 2.4365, + "step": 3224 + }, + { + "epoch": 0.369797041623667, + "grad_norm": 0.2669270447000771, + "learning_rate": 0.0007266063425678071, + "loss": 2.4471, + "step": 3225 + }, + { + "epoch": 0.36991170737300766, + "grad_norm": 0.2877288765648235, + "learning_rate": 0.0007264407977278354, + "loss": 2.4115, + "step": 3226 + }, + { + "epoch": 0.3700263731223484, + "grad_norm": 0.27120779434849474, + "learning_rate": 0.0007262752216546718, + "loss": 2.5605, + "step": 3227 + }, + { + "epoch": 0.37014103887168903, + "grad_norm": 0.2850911905268917, + "learning_rate": 0.0007261096143711545, + "loss": 2.4066, + "step": 3228 + }, + { + "epoch": 0.3702557046210297, + "grad_norm": 0.23631875609104616, + "learning_rate": 0.0007259439759001262, + "loss": 2.5284, + "step": 3229 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.2348722450075581, + "learning_rate": 0.0007257783062644333, + "loss": 2.4982, + "step": 3230 + }, + { + "epoch": 0.37048503611971106, + "grad_norm": 0.24340610980812102, + "learning_rate": 0.0007256126054869265, + "loss": 2.3834, + "step": 3231 + }, + { + "epoch": 0.3705997018690517, + "grad_norm": 0.26540718268036606, + "learning_rate": 0.0007254468735904616, + "loss": 2.5349, + "step": 3232 + }, + { + "epoch": 0.3707143676183924, + "grad_norm": 0.25597993580404177, + "learning_rate": 0.0007252811105978977, + "loss": 2.6193, + "step": 3233 + }, + { + "epoch": 0.37082903336773304, + "grad_norm": 0.25188527250381537, + "learning_rate": 0.000725115316532099, + "loss": 2.5288, + "step": 3234 + }, + { + "epoch": 0.37094369911707376, + "grad_norm": 0.26164758334789684, + "learning_rate": 0.0007249494914159332, + "loss": 2.586, + "step": 3235 + }, + { + "epoch": 0.3710583648664144, + "grad_norm": 0.23298366594038572, + "learning_rate": 0.0007247836352722733, + "loss": 2.4729, + "step": 3236 + }, + { + "epoch": 0.3711730306157551, + "grad_norm": 0.24603293215721309, + "learning_rate": 0.0007246177481239956, + "loss": 2.504, + "step": 3237 + }, + { + "epoch": 0.37128769636509573, + "grad_norm": 0.27903410293486464, + "learning_rate": 0.0007244518299939811, + "loss": 2.6111, + "step": 3238 + }, + { + "epoch": 0.3714023621144364, + "grad_norm": 0.2474475409142102, + "learning_rate": 0.0007242858809051152, + "loss": 2.5273, + "step": 3239 + }, + { + "epoch": 0.3715170278637771, + "grad_norm": 0.25753646906554684, + "learning_rate": 0.000724119900880287, + "loss": 2.5063, + "step": 3240 + }, + { + "epoch": 0.37163169361311776, + "grad_norm": 0.24352827498873375, + "learning_rate": 0.000723953889942391, + "loss": 2.4917, + "step": 3241 + }, + { + "epoch": 0.3717463593624584, + "grad_norm": 0.23353627447301376, + "learning_rate": 0.0007237878481143246, + "loss": 2.6147, + "step": 3242 + }, + { + "epoch": 0.3718610251117991, + "grad_norm": 0.24812268917828073, + "learning_rate": 0.0007236217754189903, + "loss": 2.5877, + "step": 3243 + }, + { + "epoch": 0.3719756908611398, + "grad_norm": 0.2537598939287037, + "learning_rate": 0.0007234556718792948, + "loss": 2.6926, + "step": 3244 + }, + { + "epoch": 0.37209035661048045, + "grad_norm": 0.22557097894618303, + "learning_rate": 0.0007232895375181488, + "loss": 2.4396, + "step": 3245 + }, + { + "epoch": 0.3722050223598211, + "grad_norm": 0.2630671910890379, + "learning_rate": 0.0007231233723584674, + "loss": 2.6861, + "step": 3246 + }, + { + "epoch": 0.3723196881091618, + "grad_norm": 0.2402911148950178, + "learning_rate": 0.0007229571764231699, + "loss": 2.4917, + "step": 3247 + }, + { + "epoch": 0.3724343538585025, + "grad_norm": 0.26405903773815587, + "learning_rate": 0.0007227909497351799, + "loss": 2.4953, + "step": 3248 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 0.26365237594792307, + "learning_rate": 0.000722624692317425, + "loss": 2.4865, + "step": 3249 + }, + { + "epoch": 0.3726636853571838, + "grad_norm": 0.2950275571909416, + "learning_rate": 0.0007224584041928374, + "loss": 2.5685, + "step": 3250 + }, + { + "epoch": 0.37277835110652446, + "grad_norm": 0.2507892023120786, + "learning_rate": 0.0007222920853843538, + "loss": 2.434, + "step": 3251 + }, + { + "epoch": 0.3728930168558652, + "grad_norm": 0.2753633085575958, + "learning_rate": 0.0007221257359149139, + "loss": 2.4592, + "step": 3252 + }, + { + "epoch": 0.37300768260520584, + "grad_norm": 0.24953854409271162, + "learning_rate": 0.0007219593558074629, + "loss": 2.5606, + "step": 3253 + }, + { + "epoch": 0.3731223483545465, + "grad_norm": 0.25734073797470497, + "learning_rate": 0.0007217929450849497, + "loss": 2.4357, + "step": 3254 + }, + { + "epoch": 0.37323701410388715, + "grad_norm": 0.23523324818467595, + "learning_rate": 0.0007216265037703276, + "loss": 2.453, + "step": 3255 + }, + { + "epoch": 0.3733516798532278, + "grad_norm": 0.2630094457018264, + "learning_rate": 0.0007214600318865538, + "loss": 2.5151, + "step": 3256 + }, + { + "epoch": 0.37346634560256853, + "grad_norm": 0.2630306631139327, + "learning_rate": 0.00072129352945659, + "loss": 2.4308, + "step": 3257 + }, + { + "epoch": 0.3735810113519092, + "grad_norm": 0.239780504673349, + "learning_rate": 0.000721126996503402, + "loss": 2.4762, + "step": 3258 + }, + { + "epoch": 0.37369567710124985, + "grad_norm": 0.24981416550407418, + "learning_rate": 0.0007209604330499599, + "loss": 2.6404, + "step": 3259 + }, + { + "epoch": 0.3738103428505905, + "grad_norm": 0.2292347240679528, + "learning_rate": 0.000720793839119238, + "loss": 2.6289, + "step": 3260 + }, + { + "epoch": 0.3739250085999312, + "grad_norm": 0.24431923119452312, + "learning_rate": 0.0007206272147342147, + "loss": 2.4456, + "step": 3261 + }, + { + "epoch": 0.3740396743492719, + "grad_norm": 0.2235047476141396, + "learning_rate": 0.0007204605599178728, + "loss": 2.4433, + "step": 3262 + }, + { + "epoch": 0.37415434009861254, + "grad_norm": 0.2558017428134492, + "learning_rate": 0.0007202938746931988, + "loss": 2.4333, + "step": 3263 + }, + { + "epoch": 0.3742690058479532, + "grad_norm": 0.27186364589481404, + "learning_rate": 0.0007201271590831841, + "loss": 2.5518, + "step": 3264 + }, + { + "epoch": 0.3743836715972939, + "grad_norm": 0.2577681055124668, + "learning_rate": 0.0007199604131108237, + "loss": 2.4012, + "step": 3265 + }, + { + "epoch": 0.37449833734663457, + "grad_norm": 0.281984562414622, + "learning_rate": 0.0007197936367991174, + "loss": 2.6898, + "step": 3266 + }, + { + "epoch": 0.3746130030959752, + "grad_norm": 0.2476130337581546, + "learning_rate": 0.0007196268301710684, + "loss": 2.5822, + "step": 3267 + }, + { + "epoch": 0.3747276688453159, + "grad_norm": 0.2585415917956713, + "learning_rate": 0.0007194599932496845, + "loss": 2.5309, + "step": 3268 + }, + { + "epoch": 0.3748423345946566, + "grad_norm": 0.25735917392389623, + "learning_rate": 0.000719293126057978, + "loss": 2.4814, + "step": 3269 + }, + { + "epoch": 0.37495700034399726, + "grad_norm": 0.24819746876095786, + "learning_rate": 0.0007191262286189649, + "loss": 2.4799, + "step": 3270 + }, + { + "epoch": 0.3750716660933379, + "grad_norm": 0.25917638685699074, + "learning_rate": 0.0007189593009556651, + "loss": 2.4459, + "step": 3271 + }, + { + "epoch": 0.3751863318426786, + "grad_norm": 0.24224178365774368, + "learning_rate": 0.0007187923430911039, + "loss": 2.4042, + "step": 3272 + }, + { + "epoch": 0.37530099759201924, + "grad_norm": 0.25414993525651836, + "learning_rate": 0.0007186253550483094, + "loss": 2.5821, + "step": 3273 + }, + { + "epoch": 0.37541566334135995, + "grad_norm": 0.2637319042643623, + "learning_rate": 0.0007184583368503146, + "loss": 2.4998, + "step": 3274 + }, + { + "epoch": 0.3755303290907006, + "grad_norm": 0.25101210864732776, + "learning_rate": 0.0007182912885201563, + "loss": 2.3071, + "step": 3275 + }, + { + "epoch": 0.37564499484004127, + "grad_norm": 0.2551360116957763, + "learning_rate": 0.0007181242100808759, + "loss": 2.4781, + "step": 3276 + }, + { + "epoch": 0.3757596605893819, + "grad_norm": 0.24886997041682357, + "learning_rate": 0.0007179571015555184, + "loss": 2.4702, + "step": 3277 + }, + { + "epoch": 0.37587432633872264, + "grad_norm": 0.28515522678036376, + "learning_rate": 0.0007177899629671335, + "loss": 2.5128, + "step": 3278 + }, + { + "epoch": 0.3759889920880633, + "grad_norm": 0.23683921806412583, + "learning_rate": 0.0007176227943387747, + "loss": 2.4161, + "step": 3279 + }, + { + "epoch": 0.37610365783740396, + "grad_norm": 0.2782393439960414, + "learning_rate": 0.0007174555956934996, + "loss": 2.4495, + "step": 3280 + }, + { + "epoch": 0.3762183235867446, + "grad_norm": 0.24963479356192292, + "learning_rate": 0.00071728836705437, + "loss": 2.4749, + "step": 3281 + }, + { + "epoch": 0.37633298933608533, + "grad_norm": 0.2842593860215262, + "learning_rate": 0.0007171211084444525, + "loss": 2.4916, + "step": 3282 + }, + { + "epoch": 0.376447655085426, + "grad_norm": 0.28217433601905123, + "learning_rate": 0.0007169538198868164, + "loss": 2.4329, + "step": 3283 + }, + { + "epoch": 0.37656232083476665, + "grad_norm": 0.2472455826753323, + "learning_rate": 0.0007167865014045365, + "loss": 2.4198, + "step": 3284 + }, + { + "epoch": 0.3766769865841073, + "grad_norm": 0.2804890030616936, + "learning_rate": 0.0007166191530206909, + "loss": 2.5317, + "step": 3285 + }, + { + "epoch": 0.376791652333448, + "grad_norm": 0.2926108196531338, + "learning_rate": 0.0007164517747583625, + "loss": 2.5855, + "step": 3286 + }, + { + "epoch": 0.3769063180827887, + "grad_norm": 0.2675069136044682, + "learning_rate": 0.0007162843666406376, + "loss": 2.4876, + "step": 3287 + }, + { + "epoch": 0.37702098383212934, + "grad_norm": 0.2683306624872134, + "learning_rate": 0.0007161169286906071, + "loss": 2.5256, + "step": 3288 + }, + { + "epoch": 0.37713564958147, + "grad_norm": 0.2400710929245723, + "learning_rate": 0.0007159494609313659, + "loss": 2.4482, + "step": 3289 + }, + { + "epoch": 0.37725031533081066, + "grad_norm": 0.29350463104060703, + "learning_rate": 0.0007157819633860129, + "loss": 2.4206, + "step": 3290 + }, + { + "epoch": 0.3773649810801514, + "grad_norm": 0.27010895531891316, + "learning_rate": 0.0007156144360776514, + "loss": 2.4724, + "step": 3291 + }, + { + "epoch": 0.37747964682949203, + "grad_norm": 0.2714227386542217, + "learning_rate": 0.0007154468790293882, + "loss": 2.4909, + "step": 3292 + }, + { + "epoch": 0.3775943125788327, + "grad_norm": 0.2561517048708508, + "learning_rate": 0.0007152792922643348, + "loss": 2.4928, + "step": 3293 + }, + { + "epoch": 0.37770897832817335, + "grad_norm": 0.26605625160005275, + "learning_rate": 0.0007151116758056066, + "loss": 2.6399, + "step": 3294 + }, + { + "epoch": 0.37782364407751406, + "grad_norm": 0.25441285432841027, + "learning_rate": 0.0007149440296763234, + "loss": 2.4553, + "step": 3295 + }, + { + "epoch": 0.3779383098268547, + "grad_norm": 0.24323779990089406, + "learning_rate": 0.0007147763538996083, + "loss": 2.4778, + "step": 3296 + }, + { + "epoch": 0.3780529755761954, + "grad_norm": 0.2868573711085698, + "learning_rate": 0.0007146086484985892, + "loss": 2.5466, + "step": 3297 + }, + { + "epoch": 0.37816764132553604, + "grad_norm": 0.24877904328385905, + "learning_rate": 0.000714440913496398, + "loss": 2.5608, + "step": 3298 + }, + { + "epoch": 0.37828230707487676, + "grad_norm": 0.2384267621750337, + "learning_rate": 0.0007142731489161703, + "loss": 2.5078, + "step": 3299 + }, + { + "epoch": 0.3783969728242174, + "grad_norm": 0.27211266765664244, + "learning_rate": 0.0007141053547810459, + "loss": 2.4924, + "step": 3300 + }, + { + "epoch": 0.3785116385735581, + "grad_norm": 0.2363945689063107, + "learning_rate": 0.0007139375311141693, + "loss": 2.5695, + "step": 3301 + }, + { + "epoch": 0.37862630432289873, + "grad_norm": 0.25944902731150676, + "learning_rate": 0.0007137696779386883, + "loss": 2.5451, + "step": 3302 + }, + { + "epoch": 0.37874097007223945, + "grad_norm": 0.2436715019095733, + "learning_rate": 0.0007136017952777549, + "loss": 2.6362, + "step": 3303 + }, + { + "epoch": 0.3788556358215801, + "grad_norm": 0.24953043135186878, + "learning_rate": 0.0007134338831545257, + "loss": 2.4352, + "step": 3304 + }, + { + "epoch": 0.37897030157092076, + "grad_norm": 0.2495108751703956, + "learning_rate": 0.0007132659415921605, + "loss": 2.6006, + "step": 3305 + }, + { + "epoch": 0.3790849673202614, + "grad_norm": 0.22230315546549498, + "learning_rate": 0.0007130979706138241, + "loss": 2.535, + "step": 3306 + }, + { + "epoch": 0.3791996330696021, + "grad_norm": 0.23781559852240367, + "learning_rate": 0.0007129299702426845, + "loss": 2.4698, + "step": 3307 + }, + { + "epoch": 0.3793142988189428, + "grad_norm": 0.2298514437584039, + "learning_rate": 0.0007127619405019143, + "loss": 2.4339, + "step": 3308 + }, + { + "epoch": 0.37942896456828346, + "grad_norm": 0.23273957183336502, + "learning_rate": 0.0007125938814146901, + "loss": 2.6607, + "step": 3309 + }, + { + "epoch": 0.3795436303176241, + "grad_norm": 0.22860370399677724, + "learning_rate": 0.0007124257930041924, + "loss": 2.559, + "step": 3310 + }, + { + "epoch": 0.3796582960669648, + "grad_norm": 0.26940035801422235, + "learning_rate": 0.0007122576752936058, + "loss": 2.5228, + "step": 3311 + }, + { + "epoch": 0.3797729618163055, + "grad_norm": 0.3138729181103564, + "learning_rate": 0.0007120895283061187, + "loss": 2.508, + "step": 3312 + }, + { + "epoch": 0.37988762756564615, + "grad_norm": 0.2531077710613849, + "learning_rate": 0.000711921352064924, + "loss": 2.6082, + "step": 3313 + }, + { + "epoch": 0.3800022933149868, + "grad_norm": 0.2528142608022852, + "learning_rate": 0.0007117531465932185, + "loss": 2.4632, + "step": 3314 + }, + { + "epoch": 0.38011695906432746, + "grad_norm": 0.2607900089624971, + "learning_rate": 0.0007115849119142026, + "loss": 2.4894, + "step": 3315 + }, + { + "epoch": 0.3802316248136682, + "grad_norm": 0.24983596822560497, + "learning_rate": 0.0007114166480510815, + "loss": 2.5723, + "step": 3316 + }, + { + "epoch": 0.38034629056300884, + "grad_norm": 0.23058757652317827, + "learning_rate": 0.0007112483550270639, + "loss": 2.5334, + "step": 3317 + }, + { + "epoch": 0.3804609563123495, + "grad_norm": 0.2537451053345036, + "learning_rate": 0.000711080032865362, + "loss": 2.3385, + "step": 3318 + }, + { + "epoch": 0.38057562206169016, + "grad_norm": 0.2716560183977039, + "learning_rate": 0.0007109116815891936, + "loss": 2.608, + "step": 3319 + }, + { + "epoch": 0.38069028781103087, + "grad_norm": 0.24624924368343779, + "learning_rate": 0.0007107433012217788, + "loss": 2.4975, + "step": 3320 + }, + { + "epoch": 0.38080495356037153, + "grad_norm": 0.26167266229930625, + "learning_rate": 0.0007105748917863427, + "loss": 2.523, + "step": 3321 + }, + { + "epoch": 0.3809196193097122, + "grad_norm": 0.24446805610781225, + "learning_rate": 0.0007104064533061144, + "loss": 2.512, + "step": 3322 + }, + { + "epoch": 0.38103428505905285, + "grad_norm": 0.22864118414425114, + "learning_rate": 0.0007102379858043264, + "loss": 2.4643, + "step": 3323 + }, + { + "epoch": 0.3811489508083935, + "grad_norm": 0.22163741342962576, + "learning_rate": 0.0007100694893042159, + "loss": 2.4351, + "step": 3324 + }, + { + "epoch": 0.3812636165577342, + "grad_norm": 0.24346505461253604, + "learning_rate": 0.0007099009638290235, + "loss": 2.5602, + "step": 3325 + }, + { + "epoch": 0.3813782823070749, + "grad_norm": 0.25406309761678675, + "learning_rate": 0.0007097324094019943, + "loss": 2.5139, + "step": 3326 + }, + { + "epoch": 0.38149294805641554, + "grad_norm": 0.24958389959018776, + "learning_rate": 0.000709563826046377, + "loss": 2.5542, + "step": 3327 + }, + { + "epoch": 0.3816076138057562, + "grad_norm": 0.25530414864736733, + "learning_rate": 0.0007093952137854247, + "loss": 2.4556, + "step": 3328 + }, + { + "epoch": 0.3817222795550969, + "grad_norm": 0.2596914308737866, + "learning_rate": 0.0007092265726423941, + "loss": 2.3767, + "step": 3329 + }, + { + "epoch": 0.38183694530443757, + "grad_norm": 0.26460260558623416, + "learning_rate": 0.0007090579026405458, + "loss": 2.505, + "step": 3330 + }, + { + "epoch": 0.38195161105377823, + "grad_norm": 0.25373030800212676, + "learning_rate": 0.0007088892038031449, + "loss": 2.6526, + "step": 3331 + }, + { + "epoch": 0.3820662768031189, + "grad_norm": 0.24950715628023037, + "learning_rate": 0.0007087204761534603, + "loss": 2.5383, + "step": 3332 + }, + { + "epoch": 0.3821809425524596, + "grad_norm": 0.23607081532048016, + "learning_rate": 0.0007085517197147645, + "loss": 2.4631, + "step": 3333 + }, + { + "epoch": 0.38229560830180026, + "grad_norm": 0.25358651989888337, + "learning_rate": 0.0007083829345103343, + "loss": 2.3509, + "step": 3334 + }, + { + "epoch": 0.3824102740511409, + "grad_norm": 0.2585353368431906, + "learning_rate": 0.0007082141205634505, + "loss": 2.5429, + "step": 3335 + }, + { + "epoch": 0.3825249398004816, + "grad_norm": 0.2674027725811041, + "learning_rate": 0.0007080452778973976, + "loss": 2.4928, + "step": 3336 + }, + { + "epoch": 0.3826396055498223, + "grad_norm": 0.24283100874926547, + "learning_rate": 0.0007078764065354643, + "loss": 2.441, + "step": 3337 + }, + { + "epoch": 0.38275427129916295, + "grad_norm": 0.25387582088542593, + "learning_rate": 0.0007077075065009433, + "loss": 2.4478, + "step": 3338 + }, + { + "epoch": 0.3828689370485036, + "grad_norm": 0.2456607941012106, + "learning_rate": 0.000707538577817131, + "loss": 2.6036, + "step": 3339 + }, + { + "epoch": 0.38298360279784427, + "grad_norm": 0.23931052301147723, + "learning_rate": 0.0007073696205073278, + "loss": 2.544, + "step": 3340 + }, + { + "epoch": 0.383098268547185, + "grad_norm": 0.2736046249965013, + "learning_rate": 0.0007072006345948385, + "loss": 2.5929, + "step": 3341 + }, + { + "epoch": 0.38321293429652564, + "grad_norm": 0.23083186537481612, + "learning_rate": 0.0007070316201029711, + "loss": 2.551, + "step": 3342 + }, + { + "epoch": 0.3833276000458663, + "grad_norm": 0.23528617571556937, + "learning_rate": 0.0007068625770550381, + "loss": 2.5754, + "step": 3343 + }, + { + "epoch": 0.38344226579520696, + "grad_norm": 0.22540987100634446, + "learning_rate": 0.0007066935054743559, + "loss": 2.5924, + "step": 3344 + }, + { + "epoch": 0.3835569315445476, + "grad_norm": 0.26224861677081374, + "learning_rate": 0.0007065244053842444, + "loss": 2.6004, + "step": 3345 + }, + { + "epoch": 0.38367159729388833, + "grad_norm": 0.24170142545142867, + "learning_rate": 0.0007063552768080279, + "loss": 2.4819, + "step": 3346 + }, + { + "epoch": 0.383786263043229, + "grad_norm": 0.24283165869625842, + "learning_rate": 0.0007061861197690347, + "loss": 2.5367, + "step": 3347 + }, + { + "epoch": 0.38390092879256965, + "grad_norm": 0.2513011323788103, + "learning_rate": 0.0007060169342905962, + "loss": 2.6464, + "step": 3348 + }, + { + "epoch": 0.3840155945419103, + "grad_norm": 0.2560748726793121, + "learning_rate": 0.0007058477203960488, + "loss": 2.6137, + "step": 3349 + }, + { + "epoch": 0.384130260291251, + "grad_norm": 0.2571614124123322, + "learning_rate": 0.0007056784781087322, + "loss": 2.4735, + "step": 3350 + }, + { + "epoch": 0.3842449260405917, + "grad_norm": 0.2677970324336668, + "learning_rate": 0.0007055092074519903, + "loss": 2.3965, + "step": 3351 + }, + { + "epoch": 0.38435959178993234, + "grad_norm": 0.28981485398703594, + "learning_rate": 0.0007053399084491703, + "loss": 2.4689, + "step": 3352 + }, + { + "epoch": 0.384474257539273, + "grad_norm": 0.2788058522678535, + "learning_rate": 0.0007051705811236241, + "loss": 2.4455, + "step": 3353 + }, + { + "epoch": 0.3845889232886137, + "grad_norm": 0.26058513884293627, + "learning_rate": 0.0007050012254987073, + "loss": 2.5213, + "step": 3354 + }, + { + "epoch": 0.3847035890379544, + "grad_norm": 0.2574722865085449, + "learning_rate": 0.000704831841597779, + "loss": 2.4439, + "step": 3355 + }, + { + "epoch": 0.38481825478729503, + "grad_norm": 0.2684976977094693, + "learning_rate": 0.0007046624294442026, + "loss": 2.5769, + "step": 3356 + }, + { + "epoch": 0.3849329205366357, + "grad_norm": 0.2686916145778126, + "learning_rate": 0.0007044929890613454, + "loss": 2.5885, + "step": 3357 + }, + { + "epoch": 0.3850475862859764, + "grad_norm": 0.26210879128715947, + "learning_rate": 0.0007043235204725783, + "loss": 2.4613, + "step": 3358 + }, + { + "epoch": 0.38516225203531707, + "grad_norm": 0.2336636478780257, + "learning_rate": 0.0007041540237012762, + "loss": 2.4964, + "step": 3359 + }, + { + "epoch": 0.3852769177846577, + "grad_norm": 0.24694928705261687, + "learning_rate": 0.0007039844987708182, + "loss": 2.5592, + "step": 3360 + }, + { + "epoch": 0.3853915835339984, + "grad_norm": 0.2533691713941954, + "learning_rate": 0.0007038149457045868, + "loss": 2.5027, + "step": 3361 + }, + { + "epoch": 0.38550624928333904, + "grad_norm": 0.270405223648937, + "learning_rate": 0.0007036453645259688, + "loss": 2.5241, + "step": 3362 + }, + { + "epoch": 0.38562091503267976, + "grad_norm": 0.22962353767848545, + "learning_rate": 0.0007034757552583545, + "loss": 2.6364, + "step": 3363 + }, + { + "epoch": 0.3857355807820204, + "grad_norm": 0.28689541682222786, + "learning_rate": 0.0007033061179251385, + "loss": 2.7531, + "step": 3364 + }, + { + "epoch": 0.3858502465313611, + "grad_norm": 0.264123249959899, + "learning_rate": 0.0007031364525497187, + "loss": 2.4431, + "step": 3365 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 0.23635801444780508, + "learning_rate": 0.0007029667591554975, + "loss": 2.458, + "step": 3366 + }, + { + "epoch": 0.38607957803004245, + "grad_norm": 0.23520158403652955, + "learning_rate": 0.0007027970377658809, + "loss": 2.5543, + "step": 3367 + }, + { + "epoch": 0.3861942437793831, + "grad_norm": 0.2588827576846166, + "learning_rate": 0.0007026272884042784, + "loss": 2.5377, + "step": 3368 + }, + { + "epoch": 0.38630890952872377, + "grad_norm": 0.25949575503752687, + "learning_rate": 0.0007024575110941041, + "loss": 2.4821, + "step": 3369 + }, + { + "epoch": 0.3864235752780644, + "grad_norm": 0.26892930430821027, + "learning_rate": 0.0007022877058587751, + "loss": 2.5521, + "step": 3370 + }, + { + "epoch": 0.38653824102740514, + "grad_norm": 0.2594161084094227, + "learning_rate": 0.0007021178727217131, + "loss": 2.4754, + "step": 3371 + }, + { + "epoch": 0.3866529067767458, + "grad_norm": 0.24833004195939623, + "learning_rate": 0.0007019480117063433, + "loss": 2.4935, + "step": 3372 + }, + { + "epoch": 0.38676757252608646, + "grad_norm": 0.24074450862409577, + "learning_rate": 0.0007017781228360948, + "loss": 2.5899, + "step": 3373 + }, + { + "epoch": 0.3868822382754271, + "grad_norm": 0.2398259014342528, + "learning_rate": 0.0007016082061344005, + "loss": 2.6236, + "step": 3374 + }, + { + "epoch": 0.38699690402476783, + "grad_norm": 0.24888152024426077, + "learning_rate": 0.000701438261624697, + "loss": 2.4112, + "step": 3375 + }, + { + "epoch": 0.3871115697741085, + "grad_norm": 0.24400361753616306, + "learning_rate": 0.0007012682893304254, + "loss": 2.5067, + "step": 3376 + }, + { + "epoch": 0.38722623552344915, + "grad_norm": 0.26077018264134383, + "learning_rate": 0.0007010982892750296, + "loss": 2.5201, + "step": 3377 + }, + { + "epoch": 0.3873409012727898, + "grad_norm": 0.2602928750228207, + "learning_rate": 0.0007009282614819581, + "loss": 2.4977, + "step": 3378 + }, + { + "epoch": 0.38745556702213046, + "grad_norm": 0.27300911599832545, + "learning_rate": 0.000700758205974663, + "loss": 2.5493, + "step": 3379 + }, + { + "epoch": 0.3875702327714712, + "grad_norm": 0.2539265346184591, + "learning_rate": 0.0007005881227766001, + "loss": 2.4786, + "step": 3380 + }, + { + "epoch": 0.38768489852081184, + "grad_norm": 0.23683777016482896, + "learning_rate": 0.0007004180119112293, + "loss": 2.3898, + "step": 3381 + }, + { + "epoch": 0.3877995642701525, + "grad_norm": 0.24763603992682945, + "learning_rate": 0.0007002478734020141, + "loss": 2.4997, + "step": 3382 + }, + { + "epoch": 0.38791423001949316, + "grad_norm": 0.22645043594248754, + "learning_rate": 0.0007000777072724218, + "loss": 2.5235, + "step": 3383 + }, + { + "epoch": 0.38802889576883387, + "grad_norm": 0.2683503216602483, + "learning_rate": 0.0006999075135459235, + "loss": 2.6094, + "step": 3384 + }, + { + "epoch": 0.38814356151817453, + "grad_norm": 0.24670818833312091, + "learning_rate": 0.0006997372922459944, + "loss": 2.5092, + "step": 3385 + }, + { + "epoch": 0.3882582272675152, + "grad_norm": 0.25397577423702644, + "learning_rate": 0.0006995670433961132, + "loss": 2.5657, + "step": 3386 + }, + { + "epoch": 0.38837289301685585, + "grad_norm": 0.2652067494326082, + "learning_rate": 0.0006993967670197624, + "loss": 2.4736, + "step": 3387 + }, + { + "epoch": 0.38848755876619656, + "grad_norm": 0.23549094916148403, + "learning_rate": 0.0006992264631404284, + "loss": 2.3756, + "step": 3388 + }, + { + "epoch": 0.3886022245155372, + "grad_norm": 0.25191899809067586, + "learning_rate": 0.0006990561317816016, + "loss": 2.5228, + "step": 3389 + }, + { + "epoch": 0.3887168902648779, + "grad_norm": 0.22677073268131437, + "learning_rate": 0.0006988857729667754, + "loss": 2.3681, + "step": 3390 + }, + { + "epoch": 0.38883155601421854, + "grad_norm": 0.24297554904185603, + "learning_rate": 0.0006987153867194484, + "loss": 2.5997, + "step": 3391 + }, + { + "epoch": 0.38894622176355925, + "grad_norm": 0.2716224793322931, + "learning_rate": 0.0006985449730631215, + "loss": 2.5984, + "step": 3392 + }, + { + "epoch": 0.3890608875128999, + "grad_norm": 0.3064722635186922, + "learning_rate": 0.0006983745320212998, + "loss": 2.4407, + "step": 3393 + }, + { + "epoch": 0.38917555326224057, + "grad_norm": 0.26051324921512525, + "learning_rate": 0.0006982040636174932, + "loss": 2.4945, + "step": 3394 + }, + { + "epoch": 0.38929021901158123, + "grad_norm": 0.2697298584349351, + "learning_rate": 0.0006980335678752141, + "loss": 2.4284, + "step": 3395 + }, + { + "epoch": 0.3894048847609219, + "grad_norm": 0.27946306299529206, + "learning_rate": 0.000697863044817979, + "loss": 2.5016, + "step": 3396 + }, + { + "epoch": 0.3895195505102626, + "grad_norm": 0.2868389543045102, + "learning_rate": 0.0006976924944693086, + "loss": 2.6699, + "step": 3397 + }, + { + "epoch": 0.38963421625960326, + "grad_norm": 0.237027763218194, + "learning_rate": 0.0006975219168527269, + "loss": 2.4717, + "step": 3398 + }, + { + "epoch": 0.3897488820089439, + "grad_norm": 0.23429753676065615, + "learning_rate": 0.000697351311991762, + "loss": 2.5792, + "step": 3399 + }, + { + "epoch": 0.3898635477582846, + "grad_norm": 0.23140582421065567, + "learning_rate": 0.0006971806799099452, + "loss": 2.4249, + "step": 3400 + }, + { + "epoch": 0.3899782135076253, + "grad_norm": 0.256279317365194, + "learning_rate": 0.0006970100206308126, + "loss": 2.7002, + "step": 3401 + }, + { + "epoch": 0.39009287925696595, + "grad_norm": 0.27720273391679867, + "learning_rate": 0.0006968393341779027, + "loss": 2.4539, + "step": 3402 + }, + { + "epoch": 0.3902075450063066, + "grad_norm": 0.2534174190871776, + "learning_rate": 0.0006966686205747588, + "loss": 2.446, + "step": 3403 + }, + { + "epoch": 0.39032221075564727, + "grad_norm": 0.2711533442650797, + "learning_rate": 0.0006964978798449276, + "loss": 2.5753, + "step": 3404 + }, + { + "epoch": 0.390436876504988, + "grad_norm": 0.2530624955289696, + "learning_rate": 0.0006963271120119594, + "loss": 2.4999, + "step": 3405 + }, + { + "epoch": 0.39055154225432864, + "grad_norm": 0.2369098573350872, + "learning_rate": 0.0006961563170994085, + "loss": 2.4111, + "step": 3406 + }, + { + "epoch": 0.3906662080036693, + "grad_norm": 0.24942123435146613, + "learning_rate": 0.0006959854951308328, + "loss": 2.4199, + "step": 3407 + }, + { + "epoch": 0.39078087375300996, + "grad_norm": 0.2625258239123289, + "learning_rate": 0.0006958146461297938, + "loss": 2.5919, + "step": 3408 + }, + { + "epoch": 0.3908955395023507, + "grad_norm": 0.23449644844918574, + "learning_rate": 0.0006956437701198568, + "loss": 2.5345, + "step": 3409 + }, + { + "epoch": 0.39101020525169133, + "grad_norm": 0.28390174516792493, + "learning_rate": 0.0006954728671245911, + "loss": 2.6242, + "step": 3410 + }, + { + "epoch": 0.391124871001032, + "grad_norm": 0.2833875103169099, + "learning_rate": 0.0006953019371675695, + "loss": 2.5364, + "step": 3411 + }, + { + "epoch": 0.39123953675037265, + "grad_norm": 0.25935329274473834, + "learning_rate": 0.0006951309802723685, + "loss": 2.428, + "step": 3412 + }, + { + "epoch": 0.3913542024997133, + "grad_norm": 0.22963252659029773, + "learning_rate": 0.0006949599964625682, + "loss": 2.4956, + "step": 3413 + }, + { + "epoch": 0.391468868249054, + "grad_norm": 0.23401401530948931, + "learning_rate": 0.0006947889857617526, + "loss": 2.537, + "step": 3414 + }, + { + "epoch": 0.3915835339983947, + "grad_norm": 0.27170989784952354, + "learning_rate": 0.0006946179481935095, + "loss": 2.4996, + "step": 3415 + }, + { + "epoch": 0.39169819974773534, + "grad_norm": 0.254465599905227, + "learning_rate": 0.0006944468837814302, + "loss": 2.4143, + "step": 3416 + }, + { + "epoch": 0.391812865497076, + "grad_norm": 0.2371091358321495, + "learning_rate": 0.00069427579254911, + "loss": 2.5095, + "step": 3417 + }, + { + "epoch": 0.3919275312464167, + "grad_norm": 0.23945438402459795, + "learning_rate": 0.0006941046745201472, + "loss": 2.445, + "step": 3418 + }, + { + "epoch": 0.3920421969957574, + "grad_norm": 0.24635620659937724, + "learning_rate": 0.0006939335297181447, + "loss": 2.5096, + "step": 3419 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.25276313256154886, + "learning_rate": 0.0006937623581667082, + "loss": 2.5869, + "step": 3420 + }, + { + "epoch": 0.3922715284944387, + "grad_norm": 0.2532248805180257, + "learning_rate": 0.000693591159889448, + "loss": 2.6309, + "step": 3421 + }, + { + "epoch": 0.3923861942437794, + "grad_norm": 0.24398906472736517, + "learning_rate": 0.0006934199349099775, + "loss": 2.5085, + "step": 3422 + }, + { + "epoch": 0.39250085999312007, + "grad_norm": 0.22956572760697563, + "learning_rate": 0.0006932486832519139, + "loss": 2.4465, + "step": 3423 + }, + { + "epoch": 0.3926155257424607, + "grad_norm": 0.2538271743899817, + "learning_rate": 0.000693077404938878, + "loss": 2.4636, + "step": 3424 + }, + { + "epoch": 0.3927301914918014, + "grad_norm": 0.24382766978951204, + "learning_rate": 0.0006929060999944945, + "loss": 2.4959, + "step": 3425 + }, + { + "epoch": 0.3928448572411421, + "grad_norm": 0.27106006418220685, + "learning_rate": 0.0006927347684423918, + "loss": 2.447, + "step": 3426 + }, + { + "epoch": 0.39295952299048276, + "grad_norm": 0.243147548846084, + "learning_rate": 0.0006925634103062015, + "loss": 2.4928, + "step": 3427 + }, + { + "epoch": 0.3930741887398234, + "grad_norm": 0.2595088625397045, + "learning_rate": 0.0006923920256095594, + "loss": 2.581, + "step": 3428 + }, + { + "epoch": 0.3931888544891641, + "grad_norm": 0.24535816753156603, + "learning_rate": 0.0006922206143761047, + "loss": 2.5249, + "step": 3429 + }, + { + "epoch": 0.39330352023850473, + "grad_norm": 0.23466974526859236, + "learning_rate": 0.0006920491766294803, + "loss": 2.4399, + "step": 3430 + }, + { + "epoch": 0.39341818598784545, + "grad_norm": 0.22392533723702146, + "learning_rate": 0.0006918777123933326, + "loss": 2.5055, + "step": 3431 + }, + { + "epoch": 0.3935328517371861, + "grad_norm": 0.24880237902738353, + "learning_rate": 0.0006917062216913123, + "loss": 2.5228, + "step": 3432 + }, + { + "epoch": 0.39364751748652677, + "grad_norm": 0.22670082143900633, + "learning_rate": 0.0006915347045470728, + "loss": 2.4754, + "step": 3433 + }, + { + "epoch": 0.3937621832358674, + "grad_norm": 0.24247326165471242, + "learning_rate": 0.0006913631609842718, + "loss": 2.5569, + "step": 3434 + }, + { + "epoch": 0.39387684898520814, + "grad_norm": 0.24460390931253906, + "learning_rate": 0.0006911915910265703, + "loss": 2.4534, + "step": 3435 + }, + { + "epoch": 0.3939915147345488, + "grad_norm": 0.24531734119030324, + "learning_rate": 0.0006910199946976337, + "loss": 2.4521, + "step": 3436 + }, + { + "epoch": 0.39410618048388946, + "grad_norm": 0.2211504094104953, + "learning_rate": 0.0006908483720211296, + "loss": 2.4809, + "step": 3437 + }, + { + "epoch": 0.3942208462332301, + "grad_norm": 0.21903256213047728, + "learning_rate": 0.0006906767230207306, + "loss": 2.3215, + "step": 3438 + }, + { + "epoch": 0.39433551198257083, + "grad_norm": 0.2556524374140686, + "learning_rate": 0.0006905050477201124, + "loss": 2.5591, + "step": 3439 + }, + { + "epoch": 0.3944501777319115, + "grad_norm": 0.23604180320237061, + "learning_rate": 0.0006903333461429539, + "loss": 2.4818, + "step": 3440 + }, + { + "epoch": 0.39456484348125215, + "grad_norm": 0.2414555034243416, + "learning_rate": 0.0006901616183129386, + "loss": 2.5066, + "step": 3441 + }, + { + "epoch": 0.3946795092305928, + "grad_norm": 0.2516564418903787, + "learning_rate": 0.0006899898642537531, + "loss": 2.6177, + "step": 3442 + }, + { + "epoch": 0.3947941749799335, + "grad_norm": 0.22462701663331472, + "learning_rate": 0.000689818083989087, + "loss": 2.5115, + "step": 3443 + }, + { + "epoch": 0.3949088407292742, + "grad_norm": 0.24372804352616514, + "learning_rate": 0.0006896462775426346, + "loss": 2.4744, + "step": 3444 + }, + { + "epoch": 0.39502350647861484, + "grad_norm": 0.2424426429532698, + "learning_rate": 0.0006894744449380932, + "loss": 2.3823, + "step": 3445 + }, + { + "epoch": 0.3951381722279555, + "grad_norm": 0.2650612312537781, + "learning_rate": 0.0006893025861991639, + "loss": 2.4716, + "step": 3446 + }, + { + "epoch": 0.39525283797729616, + "grad_norm": 0.2544222718183807, + "learning_rate": 0.0006891307013495513, + "loss": 2.4273, + "step": 3447 + }, + { + "epoch": 0.39536750372663687, + "grad_norm": 0.2524410260924627, + "learning_rate": 0.0006889587904129634, + "loss": 2.3621, + "step": 3448 + }, + { + "epoch": 0.39548216947597753, + "grad_norm": 0.2756653013225916, + "learning_rate": 0.0006887868534131124, + "loss": 2.5206, + "step": 3449 + }, + { + "epoch": 0.3955968352253182, + "grad_norm": 0.2383878767816064, + "learning_rate": 0.0006886148903737135, + "loss": 2.4146, + "step": 3450 + }, + { + "epoch": 0.39571150097465885, + "grad_norm": 0.25308982538327024, + "learning_rate": 0.0006884429013184858, + "loss": 2.5237, + "step": 3451 + }, + { + "epoch": 0.39582616672399956, + "grad_norm": 0.2202443264895399, + "learning_rate": 0.000688270886271152, + "loss": 2.4316, + "step": 3452 + }, + { + "epoch": 0.3959408324733402, + "grad_norm": 0.27338372851270565, + "learning_rate": 0.0006880988452554382, + "loss": 2.4589, + "step": 3453 + }, + { + "epoch": 0.3960554982226809, + "grad_norm": 0.2648600848676643, + "learning_rate": 0.0006879267782950742, + "loss": 2.4844, + "step": 3454 + }, + { + "epoch": 0.39617016397202154, + "grad_norm": 0.240593248731978, + "learning_rate": 0.0006877546854137933, + "loss": 2.445, + "step": 3455 + }, + { + "epoch": 0.39628482972136225, + "grad_norm": 0.23166279368435969, + "learning_rate": 0.0006875825666353324, + "loss": 2.4602, + "step": 3456 + }, + { + "epoch": 0.3963994954707029, + "grad_norm": 0.25207666298206705, + "learning_rate": 0.0006874104219834322, + "loss": 2.4716, + "step": 3457 + }, + { + "epoch": 0.39651416122004357, + "grad_norm": 0.23325698193040936, + "learning_rate": 0.0006872382514818365, + "loss": 2.5047, + "step": 3458 + }, + { + "epoch": 0.39662882696938423, + "grad_norm": 0.25142353316306637, + "learning_rate": 0.0006870660551542932, + "loss": 2.7074, + "step": 3459 + }, + { + "epoch": 0.39674349271872494, + "grad_norm": 0.26247366942673633, + "learning_rate": 0.0006868938330245534, + "loss": 2.537, + "step": 3460 + }, + { + "epoch": 0.3968581584680656, + "grad_norm": 0.21995668989746636, + "learning_rate": 0.000686721585116372, + "loss": 2.5214, + "step": 3461 + }, + { + "epoch": 0.39697282421740626, + "grad_norm": 0.2336001441475452, + "learning_rate": 0.0006865493114535068, + "loss": 2.4334, + "step": 3462 + }, + { + "epoch": 0.3970874899667469, + "grad_norm": 0.27731185119659035, + "learning_rate": 0.0006863770120597204, + "loss": 2.4409, + "step": 3463 + }, + { + "epoch": 0.3972021557160876, + "grad_norm": 0.25157418462555853, + "learning_rate": 0.0006862046869587777, + "loss": 2.4784, + "step": 3464 + }, + { + "epoch": 0.3973168214654283, + "grad_norm": 0.22525231345997476, + "learning_rate": 0.0006860323361744477, + "loss": 2.5486, + "step": 3465 + }, + { + "epoch": 0.39743148721476895, + "grad_norm": 0.2364383170061946, + "learning_rate": 0.0006858599597305033, + "loss": 2.4256, + "step": 3466 + }, + { + "epoch": 0.3975461529641096, + "grad_norm": 0.2551873672426385, + "learning_rate": 0.0006856875576507201, + "loss": 2.4882, + "step": 3467 + }, + { + "epoch": 0.39766081871345027, + "grad_norm": 0.24986593911146548, + "learning_rate": 0.0006855151299588778, + "loss": 2.4913, + "step": 3468 + }, + { + "epoch": 0.397775484462791, + "grad_norm": 0.2536870339259371, + "learning_rate": 0.0006853426766787597, + "loss": 2.3994, + "step": 3469 + }, + { + "epoch": 0.39789015021213164, + "grad_norm": 0.2525393496704512, + "learning_rate": 0.0006851701978341522, + "loss": 2.442, + "step": 3470 + }, + { + "epoch": 0.3980048159614723, + "grad_norm": 0.2757037585261663, + "learning_rate": 0.0006849976934488456, + "loss": 2.5837, + "step": 3471 + }, + { + "epoch": 0.39811948171081296, + "grad_norm": 0.28814508659361027, + "learning_rate": 0.0006848251635466336, + "loss": 2.5607, + "step": 3472 + }, + { + "epoch": 0.3982341474601537, + "grad_norm": 0.2534401532808621, + "learning_rate": 0.0006846526081513134, + "loss": 2.4603, + "step": 3473 + }, + { + "epoch": 0.39834881320949433, + "grad_norm": 0.26584622023382487, + "learning_rate": 0.0006844800272866856, + "loss": 2.5796, + "step": 3474 + }, + { + "epoch": 0.398463478958835, + "grad_norm": 0.2518383117018804, + "learning_rate": 0.0006843074209765545, + "loss": 2.4725, + "step": 3475 + }, + { + "epoch": 0.39857814470817565, + "grad_norm": 0.2739666290635124, + "learning_rate": 0.0006841347892447281, + "loss": 2.5288, + "step": 3476 + }, + { + "epoch": 0.39869281045751637, + "grad_norm": 0.2734179509054692, + "learning_rate": 0.0006839621321150174, + "loss": 2.3651, + "step": 3477 + }, + { + "epoch": 0.398807476206857, + "grad_norm": 0.260977339350182, + "learning_rate": 0.0006837894496112371, + "loss": 2.6469, + "step": 3478 + }, + { + "epoch": 0.3989221419561977, + "grad_norm": 0.26027170895926666, + "learning_rate": 0.0006836167417572056, + "loss": 2.5171, + "step": 3479 + }, + { + "epoch": 0.39903680770553834, + "grad_norm": 0.25059843801228093, + "learning_rate": 0.0006834440085767447, + "loss": 2.5446, + "step": 3480 + }, + { + "epoch": 0.399151473454879, + "grad_norm": 0.24360400055498876, + "learning_rate": 0.0006832712500936795, + "loss": 2.3917, + "step": 3481 + }, + { + "epoch": 0.3992661392042197, + "grad_norm": 0.2314381083023964, + "learning_rate": 0.0006830984663318391, + "loss": 2.4277, + "step": 3482 + }, + { + "epoch": 0.3993808049535604, + "grad_norm": 0.23477266980800515, + "learning_rate": 0.0006829256573150551, + "loss": 2.5436, + "step": 3483 + }, + { + "epoch": 0.39949547070290103, + "grad_norm": 0.26287394114319207, + "learning_rate": 0.0006827528230671636, + "loss": 2.4673, + "step": 3484 + }, + { + "epoch": 0.3996101364522417, + "grad_norm": 0.22709965455363912, + "learning_rate": 0.0006825799636120038, + "loss": 2.3757, + "step": 3485 + }, + { + "epoch": 0.3997248022015824, + "grad_norm": 0.23607424202524577, + "learning_rate": 0.0006824070789734184, + "loss": 2.4539, + "step": 3486 + }, + { + "epoch": 0.39983946795092307, + "grad_norm": 0.24428311751606094, + "learning_rate": 0.0006822341691752532, + "loss": 2.4794, + "step": 3487 + }, + { + "epoch": 0.3999541337002637, + "grad_norm": 0.23607446147673483, + "learning_rate": 0.0006820612342413583, + "loss": 2.3958, + "step": 3488 + }, + { + "epoch": 0.4000687994496044, + "grad_norm": 0.22663082098054665, + "learning_rate": 0.0006818882741955866, + "loss": 2.5992, + "step": 3489 + }, + { + "epoch": 0.4001834651989451, + "grad_norm": 0.2433514403086265, + "learning_rate": 0.0006817152890617943, + "loss": 2.5298, + "step": 3490 + }, + { + "epoch": 0.40029813094828576, + "grad_norm": 0.23454574950120408, + "learning_rate": 0.0006815422788638418, + "loss": 2.3761, + "step": 3491 + }, + { + "epoch": 0.4004127966976264, + "grad_norm": 0.25589519711579845, + "learning_rate": 0.0006813692436255926, + "loss": 2.4322, + "step": 3492 + }, + { + "epoch": 0.4005274624469671, + "grad_norm": 0.2749015801238749, + "learning_rate": 0.0006811961833709132, + "loss": 2.4968, + "step": 3493 + }, + { + "epoch": 0.4006421281963078, + "grad_norm": 0.23676183333455014, + "learning_rate": 0.0006810230981236743, + "loss": 2.6439, + "step": 3494 + }, + { + "epoch": 0.40075679394564845, + "grad_norm": 0.26372049753974247, + "learning_rate": 0.0006808499879077496, + "loss": 2.5209, + "step": 3495 + }, + { + "epoch": 0.4008714596949891, + "grad_norm": 0.27357719851324697, + "learning_rate": 0.0006806768527470165, + "loss": 2.6535, + "step": 3496 + }, + { + "epoch": 0.40098612544432977, + "grad_norm": 0.2412935658786769, + "learning_rate": 0.0006805036926653556, + "loss": 2.5792, + "step": 3497 + }, + { + "epoch": 0.4011007911936704, + "grad_norm": 0.25358931216611796, + "learning_rate": 0.0006803305076866509, + "loss": 2.5416, + "step": 3498 + }, + { + "epoch": 0.40121545694301114, + "grad_norm": 0.2801363104409284, + "learning_rate": 0.0006801572978347901, + "loss": 2.5074, + "step": 3499 + }, + { + "epoch": 0.4013301226923518, + "grad_norm": 0.26628841587940444, + "learning_rate": 0.0006799840631336642, + "loss": 2.5701, + "step": 3500 + }, + { + "epoch": 0.40144478844169246, + "grad_norm": 0.2455082170331015, + "learning_rate": 0.0006798108036071677, + "loss": 2.4808, + "step": 3501 + }, + { + "epoch": 0.4015594541910331, + "grad_norm": 0.2462018854171415, + "learning_rate": 0.0006796375192791982, + "loss": 2.4976, + "step": 3502 + }, + { + "epoch": 0.40167411994037383, + "grad_norm": 0.2598295656209238, + "learning_rate": 0.000679464210173657, + "loss": 2.5126, + "step": 3503 + }, + { + "epoch": 0.4017887856897145, + "grad_norm": 0.2698869037579159, + "learning_rate": 0.0006792908763144492, + "loss": 2.5185, + "step": 3504 + }, + { + "epoch": 0.40190345143905515, + "grad_norm": 0.24562462027196158, + "learning_rate": 0.0006791175177254825, + "loss": 2.6687, + "step": 3505 + }, + { + "epoch": 0.4020181171883958, + "grad_norm": 0.2533011231506365, + "learning_rate": 0.0006789441344306684, + "loss": 2.6081, + "step": 3506 + }, + { + "epoch": 0.4021327829377365, + "grad_norm": 0.2151082870041164, + "learning_rate": 0.0006787707264539222, + "loss": 2.3799, + "step": 3507 + }, + { + "epoch": 0.4022474486870772, + "grad_norm": 0.2249546366806936, + "learning_rate": 0.0006785972938191617, + "loss": 2.4518, + "step": 3508 + }, + { + "epoch": 0.40236211443641784, + "grad_norm": 0.2410686366347411, + "learning_rate": 0.0006784238365503089, + "loss": 2.3536, + "step": 3509 + }, + { + "epoch": 0.4024767801857585, + "grad_norm": 0.2510781475727969, + "learning_rate": 0.000678250354671289, + "loss": 2.5769, + "step": 3510 + }, + { + "epoch": 0.4025914459350992, + "grad_norm": 0.270632605685379, + "learning_rate": 0.0006780768482060305, + "loss": 2.6068, + "step": 3511 + }, + { + "epoch": 0.40270611168443987, + "grad_norm": 0.27269221596085413, + "learning_rate": 0.0006779033171784652, + "loss": 2.5001, + "step": 3512 + }, + { + "epoch": 0.40282077743378053, + "grad_norm": 0.24945750276100276, + "learning_rate": 0.0006777297616125283, + "loss": 2.5231, + "step": 3513 + }, + { + "epoch": 0.4029354431831212, + "grad_norm": 0.2825273753769796, + "learning_rate": 0.0006775561815321589, + "loss": 2.6388, + "step": 3514 + }, + { + "epoch": 0.40305010893246185, + "grad_norm": 0.2707006795252678, + "learning_rate": 0.0006773825769612984, + "loss": 2.5987, + "step": 3515 + }, + { + "epoch": 0.40316477468180256, + "grad_norm": 0.26793658776465445, + "learning_rate": 0.000677208947923893, + "loss": 2.4096, + "step": 3516 + }, + { + "epoch": 0.4032794404311432, + "grad_norm": 0.24159784148288316, + "learning_rate": 0.0006770352944438911, + "loss": 2.4356, + "step": 3517 + }, + { + "epoch": 0.4033941061804839, + "grad_norm": 0.24652558370778413, + "learning_rate": 0.0006768616165452447, + "loss": 2.5892, + "step": 3518 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 0.22357933453286, + "learning_rate": 0.0006766879142519098, + "loss": 2.5574, + "step": 3519 + }, + { + "epoch": 0.40362343767916525, + "grad_norm": 0.270288053307248, + "learning_rate": 0.0006765141875878449, + "loss": 2.5329, + "step": 3520 + }, + { + "epoch": 0.4037381034285059, + "grad_norm": 0.24454405274851926, + "learning_rate": 0.0006763404365770126, + "loss": 2.4554, + "step": 3521 + }, + { + "epoch": 0.40385276917784657, + "grad_norm": 0.24533602135902838, + "learning_rate": 0.0006761666612433786, + "loss": 2.4467, + "step": 3522 + }, + { + "epoch": 0.40396743492718723, + "grad_norm": 0.26331189250185794, + "learning_rate": 0.0006759928616109115, + "loss": 2.3823, + "step": 3523 + }, + { + "epoch": 0.40408210067652794, + "grad_norm": 0.26912352846662463, + "learning_rate": 0.0006758190377035839, + "loss": 2.5483, + "step": 3524 + }, + { + "epoch": 0.4041967664258686, + "grad_norm": 0.2575097892956589, + "learning_rate": 0.0006756451895453715, + "loss": 2.4343, + "step": 3525 + }, + { + "epoch": 0.40431143217520926, + "grad_norm": 0.2327774492911776, + "learning_rate": 0.0006754713171602533, + "loss": 2.3842, + "step": 3526 + }, + { + "epoch": 0.4044260979245499, + "grad_norm": 0.2956897531261791, + "learning_rate": 0.0006752974205722117, + "loss": 2.4856, + "step": 3527 + }, + { + "epoch": 0.40454076367389064, + "grad_norm": 0.22320166242892028, + "learning_rate": 0.0006751234998052324, + "loss": 2.4065, + "step": 3528 + }, + { + "epoch": 0.4046554294232313, + "grad_norm": 0.2536882302132791, + "learning_rate": 0.0006749495548833044, + "loss": 2.4622, + "step": 3529 + }, + { + "epoch": 0.40477009517257195, + "grad_norm": 0.2575695863121073, + "learning_rate": 0.0006747755858304203, + "loss": 2.4104, + "step": 3530 + }, + { + "epoch": 0.4048847609219126, + "grad_norm": 0.2497124793145354, + "learning_rate": 0.0006746015926705755, + "loss": 2.5257, + "step": 3531 + }, + { + "epoch": 0.40499942667125327, + "grad_norm": 0.25799276291059586, + "learning_rate": 0.0006744275754277694, + "loss": 2.4628, + "step": 3532 + }, + { + "epoch": 0.405114092420594, + "grad_norm": 0.21248725076372407, + "learning_rate": 0.0006742535341260038, + "loss": 2.528, + "step": 3533 + }, + { + "epoch": 0.40522875816993464, + "grad_norm": 0.2527115637799467, + "learning_rate": 0.000674079468789285, + "loss": 2.5432, + "step": 3534 + }, + { + "epoch": 0.4053434239192753, + "grad_norm": 0.24634507453950957, + "learning_rate": 0.0006739053794416217, + "loss": 2.5107, + "step": 3535 + }, + { + "epoch": 0.40545808966861596, + "grad_norm": 0.2387080978589809, + "learning_rate": 0.0006737312661070263, + "loss": 2.4136, + "step": 3536 + }, + { + "epoch": 0.4055727554179567, + "grad_norm": 0.26738517616855206, + "learning_rate": 0.0006735571288095144, + "loss": 2.3522, + "step": 3537 + }, + { + "epoch": 0.40568742116729734, + "grad_norm": 0.2551480931225143, + "learning_rate": 0.0006733829675731047, + "loss": 2.5338, + "step": 3538 + }, + { + "epoch": 0.405802086916638, + "grad_norm": 0.27697632795609856, + "learning_rate": 0.0006732087824218197, + "loss": 2.5844, + "step": 3539 + }, + { + "epoch": 0.40591675266597865, + "grad_norm": 0.24403603088332065, + "learning_rate": 0.0006730345733796847, + "loss": 2.4592, + "step": 3540 + }, + { + "epoch": 0.40603141841531937, + "grad_norm": 0.27218519247192985, + "learning_rate": 0.0006728603404707288, + "loss": 2.4427, + "step": 3541 + }, + { + "epoch": 0.40614608416466, + "grad_norm": 0.2485417122820804, + "learning_rate": 0.0006726860837189839, + "loss": 2.5799, + "step": 3542 + }, + { + "epoch": 0.4062607499140007, + "grad_norm": 0.26977814709661535, + "learning_rate": 0.0006725118031484855, + "loss": 2.6267, + "step": 3543 + }, + { + "epoch": 0.40637541566334134, + "grad_norm": 0.27824614542165405, + "learning_rate": 0.0006723374987832722, + "loss": 2.6168, + "step": 3544 + }, + { + "epoch": 0.40649008141268206, + "grad_norm": 0.220886661938332, + "learning_rate": 0.000672163170647386, + "loss": 2.348, + "step": 3545 + }, + { + "epoch": 0.4066047471620227, + "grad_norm": 0.24014108784041616, + "learning_rate": 0.0006719888187648721, + "loss": 2.6736, + "step": 3546 + }, + { + "epoch": 0.4067194129113634, + "grad_norm": 0.24548970801057948, + "learning_rate": 0.000671814443159779, + "loss": 2.5024, + "step": 3547 + }, + { + "epoch": 0.40683407866070403, + "grad_norm": 0.27866067147075346, + "learning_rate": 0.0006716400438561588, + "loss": 2.5526, + "step": 3548 + }, + { + "epoch": 0.4069487444100447, + "grad_norm": 0.24828534859770182, + "learning_rate": 0.000671465620878066, + "loss": 2.457, + "step": 3549 + }, + { + "epoch": 0.4070634101593854, + "grad_norm": 0.24418153140688964, + "learning_rate": 0.0006712911742495593, + "loss": 2.558, + "step": 3550 + }, + { + "epoch": 0.40717807590872607, + "grad_norm": 0.24879033242886733, + "learning_rate": 0.0006711167039947003, + "loss": 2.6623, + "step": 3551 + }, + { + "epoch": 0.4072927416580667, + "grad_norm": 0.23076712626394916, + "learning_rate": 0.0006709422101375537, + "loss": 2.382, + "step": 3552 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.233241823757704, + "learning_rate": 0.0006707676927021878, + "loss": 2.4201, + "step": 3553 + }, + { + "epoch": 0.4075220731567481, + "grad_norm": 0.2518317952991465, + "learning_rate": 0.0006705931517126738, + "loss": 2.4629, + "step": 3554 + }, + { + "epoch": 0.40763673890608876, + "grad_norm": 0.218267423793303, + "learning_rate": 0.0006704185871930861, + "loss": 2.4346, + "step": 3555 + }, + { + "epoch": 0.4077514046554294, + "grad_norm": 0.24477868515496426, + "learning_rate": 0.0006702439991675029, + "loss": 2.4029, + "step": 3556 + }, + { + "epoch": 0.4078660704047701, + "grad_norm": 0.24641100097704066, + "learning_rate": 0.0006700693876600052, + "loss": 2.5767, + "step": 3557 + }, + { + "epoch": 0.4079807361541108, + "grad_norm": 0.27800826495805664, + "learning_rate": 0.0006698947526946774, + "loss": 2.5602, + "step": 3558 + }, + { + "epoch": 0.40809540190345145, + "grad_norm": 0.2629583839524337, + "learning_rate": 0.0006697200942956068, + "loss": 2.55, + "step": 3559 + }, + { + "epoch": 0.4082100676527921, + "grad_norm": 0.2788832333582697, + "learning_rate": 0.0006695454124868843, + "loss": 2.572, + "step": 3560 + }, + { + "epoch": 0.40832473340213277, + "grad_norm": 0.25564654251794505, + "learning_rate": 0.0006693707072926041, + "loss": 2.3577, + "step": 3561 + }, + { + "epoch": 0.4084393991514735, + "grad_norm": 0.23763690261754442, + "learning_rate": 0.0006691959787368633, + "loss": 2.5355, + "step": 3562 + }, + { + "epoch": 0.40855406490081414, + "grad_norm": 0.2644517325209059, + "learning_rate": 0.0006690212268437623, + "loss": 2.6073, + "step": 3563 + }, + { + "epoch": 0.4086687306501548, + "grad_norm": 0.257926060801365, + "learning_rate": 0.000668846451637405, + "loss": 2.5808, + "step": 3564 + }, + { + "epoch": 0.40878339639949546, + "grad_norm": 0.27320232523302707, + "learning_rate": 0.0006686716531418981, + "loss": 2.4574, + "step": 3565 + }, + { + "epoch": 0.4088980621488361, + "grad_norm": 0.23680436642003821, + "learning_rate": 0.000668496831381352, + "loss": 2.5671, + "step": 3566 + }, + { + "epoch": 0.40901272789817683, + "grad_norm": 0.2504073988783255, + "learning_rate": 0.0006683219863798797, + "loss": 2.4922, + "step": 3567 + }, + { + "epoch": 0.4091273936475175, + "grad_norm": 0.24519806424163562, + "learning_rate": 0.0006681471181615979, + "loss": 2.5479, + "step": 3568 + }, + { + "epoch": 0.40924205939685815, + "grad_norm": 0.22703365616722754, + "learning_rate": 0.0006679722267506265, + "loss": 2.5293, + "step": 3569 + }, + { + "epoch": 0.4093567251461988, + "grad_norm": 0.23784346246107996, + "learning_rate": 0.0006677973121710881, + "loss": 2.5702, + "step": 3570 + }, + { + "epoch": 0.4094713908955395, + "grad_norm": 0.2526280044762824, + "learning_rate": 0.000667622374447109, + "loss": 2.5173, + "step": 3571 + }, + { + "epoch": 0.4095860566448802, + "grad_norm": 0.24647167452323582, + "learning_rate": 0.0006674474136028186, + "loss": 2.6485, + "step": 3572 + }, + { + "epoch": 0.40970072239422084, + "grad_norm": 0.2410423997947158, + "learning_rate": 0.0006672724296623492, + "loss": 2.4487, + "step": 3573 + }, + { + "epoch": 0.4098153881435615, + "grad_norm": 0.2219595850749613, + "learning_rate": 0.0006670974226498367, + "loss": 2.5494, + "step": 3574 + }, + { + "epoch": 0.4099300538929022, + "grad_norm": 0.2672669502068693, + "learning_rate": 0.0006669223925894199, + "loss": 2.5907, + "step": 3575 + }, + { + "epoch": 0.41004471964224287, + "grad_norm": 0.25574896327663926, + "learning_rate": 0.0006667473395052411, + "loss": 2.4905, + "step": 3576 + }, + { + "epoch": 0.41015938539158353, + "grad_norm": 0.27217312100435875, + "learning_rate": 0.000666572263421445, + "loss": 2.4457, + "step": 3577 + }, + { + "epoch": 0.4102740511409242, + "grad_norm": 0.23218071631371606, + "learning_rate": 0.0006663971643621803, + "loss": 2.6276, + "step": 3578 + }, + { + "epoch": 0.4103887168902649, + "grad_norm": 0.24049867652940113, + "learning_rate": 0.0006662220423515987, + "loss": 2.487, + "step": 3579 + }, + { + "epoch": 0.41050338263960556, + "grad_norm": 0.25520560062515957, + "learning_rate": 0.0006660468974138549, + "loss": 2.5403, + "step": 3580 + }, + { + "epoch": 0.4106180483889462, + "grad_norm": 0.26877062694512405, + "learning_rate": 0.0006658717295731067, + "loss": 2.5569, + "step": 3581 + }, + { + "epoch": 0.4107327141382869, + "grad_norm": 0.2656020202464822, + "learning_rate": 0.0006656965388535153, + "loss": 2.5911, + "step": 3582 + }, + { + "epoch": 0.41084737988762754, + "grad_norm": 0.21984036435657617, + "learning_rate": 0.0006655213252792446, + "loss": 2.5534, + "step": 3583 + }, + { + "epoch": 0.41096204563696825, + "grad_norm": 0.2507994646409042, + "learning_rate": 0.0006653460888744625, + "loss": 2.4291, + "step": 3584 + }, + { + "epoch": 0.4110767113863089, + "grad_norm": 0.25558310988053434, + "learning_rate": 0.000665170829663339, + "loss": 2.4945, + "step": 3585 + }, + { + "epoch": 0.41119137713564957, + "grad_norm": 0.23885816357006967, + "learning_rate": 0.000664995547670048, + "loss": 2.4038, + "step": 3586 + }, + { + "epoch": 0.41130604288499023, + "grad_norm": 0.23752428725937302, + "learning_rate": 0.0006648202429187664, + "loss": 2.4505, + "step": 3587 + }, + { + "epoch": 0.41142070863433094, + "grad_norm": 0.25642889873560293, + "learning_rate": 0.0006646449154336739, + "loss": 2.5711, + "step": 3588 + }, + { + "epoch": 0.4115353743836716, + "grad_norm": 0.2494335750827775, + "learning_rate": 0.0006644695652389538, + "loss": 2.643, + "step": 3589 + }, + { + "epoch": 0.41165004013301226, + "grad_norm": 0.2308929404375352, + "learning_rate": 0.0006642941923587923, + "loss": 2.4435, + "step": 3590 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.2833753046214647, + "learning_rate": 0.0006641187968173788, + "loss": 2.5904, + "step": 3591 + }, + { + "epoch": 0.41187937163169364, + "grad_norm": 0.2623056127707425, + "learning_rate": 0.0006639433786389054, + "loss": 2.4975, + "step": 3592 + }, + { + "epoch": 0.4119940373810343, + "grad_norm": 0.2563522846831175, + "learning_rate": 0.0006637679378475681, + "loss": 2.5313, + "step": 3593 + }, + { + "epoch": 0.41210870313037495, + "grad_norm": 0.23622099991246473, + "learning_rate": 0.0006635924744675656, + "loss": 2.4567, + "step": 3594 + }, + { + "epoch": 0.4122233688797156, + "grad_norm": 0.2557073747048083, + "learning_rate": 0.0006634169885230994, + "loss": 2.5882, + "step": 3595 + }, + { + "epoch": 0.4123380346290563, + "grad_norm": 0.23446513582048864, + "learning_rate": 0.0006632414800383747, + "loss": 2.607, + "step": 3596 + }, + { + "epoch": 0.412452700378397, + "grad_norm": 0.27078964868869115, + "learning_rate": 0.0006630659490375996, + "loss": 2.4946, + "step": 3597 + }, + { + "epoch": 0.41256736612773764, + "grad_norm": 0.2510349246887973, + "learning_rate": 0.000662890395544985, + "loss": 2.4983, + "step": 3598 + }, + { + "epoch": 0.4126820318770783, + "grad_norm": 0.2624048298693548, + "learning_rate": 0.0006627148195847455, + "loss": 2.6635, + "step": 3599 + }, + { + "epoch": 0.41279669762641896, + "grad_norm": 0.23455693596253335, + "learning_rate": 0.000662539221181098, + "loss": 2.5564, + "step": 3600 + }, + { + "epoch": 0.4129113633757597, + "grad_norm": 0.25613737118542623, + "learning_rate": 0.0006623636003582633, + "loss": 2.4028, + "step": 3601 + }, + { + "epoch": 0.41302602912510034, + "grad_norm": 0.24004788742980104, + "learning_rate": 0.0006621879571404649, + "loss": 2.3777, + "step": 3602 + }, + { + "epoch": 0.413140694874441, + "grad_norm": 0.23932669434737375, + "learning_rate": 0.0006620122915519295, + "loss": 2.4375, + "step": 3603 + }, + { + "epoch": 0.41325536062378165, + "grad_norm": 0.24911292126860454, + "learning_rate": 0.0006618366036168867, + "loss": 2.6164, + "step": 3604 + }, + { + "epoch": 0.41337002637312237, + "grad_norm": 0.25841433158259774, + "learning_rate": 0.0006616608933595692, + "loss": 2.3431, + "step": 3605 + }, + { + "epoch": 0.413484692122463, + "grad_norm": 0.269625998866878, + "learning_rate": 0.0006614851608042131, + "loss": 2.5073, + "step": 3606 + }, + { + "epoch": 0.4135993578718037, + "grad_norm": 0.24972738773855116, + "learning_rate": 0.0006613094059750573, + "loss": 2.5458, + "step": 3607 + }, + { + "epoch": 0.41371402362114434, + "grad_norm": 0.26382713628262017, + "learning_rate": 0.0006611336288963436, + "loss": 2.5807, + "step": 3608 + }, + { + "epoch": 0.41382868937048506, + "grad_norm": 0.22421890646451972, + "learning_rate": 0.0006609578295923174, + "loss": 2.4474, + "step": 3609 + }, + { + "epoch": 0.4139433551198257, + "grad_norm": 0.2465673718023568, + "learning_rate": 0.0006607820080872267, + "loss": 2.4774, + "step": 3610 + }, + { + "epoch": 0.4140580208691664, + "grad_norm": 0.24079104017880662, + "learning_rate": 0.0006606061644053227, + "loss": 2.5858, + "step": 3611 + }, + { + "epoch": 0.41417268661850704, + "grad_norm": 0.2683715937947471, + "learning_rate": 0.0006604302985708599, + "loss": 2.4917, + "step": 3612 + }, + { + "epoch": 0.41428735236784775, + "grad_norm": 0.24963967133269516, + "learning_rate": 0.0006602544106080955, + "loss": 2.5565, + "step": 3613 + }, + { + "epoch": 0.4144020181171884, + "grad_norm": 0.24601157467161391, + "learning_rate": 0.0006600785005412897, + "loss": 2.4923, + "step": 3614 + }, + { + "epoch": 0.41451668386652907, + "grad_norm": 0.26679595250387034, + "learning_rate": 0.0006599025683947062, + "loss": 2.5377, + "step": 3615 + }, + { + "epoch": 0.4146313496158697, + "grad_norm": 0.2390027519268333, + "learning_rate": 0.0006597266141926115, + "loss": 2.4857, + "step": 3616 + }, + { + "epoch": 0.4147460153652104, + "grad_norm": 0.25384683547884496, + "learning_rate": 0.000659550637959275, + "loss": 2.4733, + "step": 3617 + }, + { + "epoch": 0.4148606811145511, + "grad_norm": 0.23825501486501358, + "learning_rate": 0.0006593746397189692, + "loss": 2.5325, + "step": 3618 + }, + { + "epoch": 0.41497534686389176, + "grad_norm": 0.23378454436654672, + "learning_rate": 0.00065919861949597, + "loss": 2.4638, + "step": 3619 + }, + { + "epoch": 0.4150900126132324, + "grad_norm": 0.22843532370253755, + "learning_rate": 0.0006590225773145556, + "loss": 2.346, + "step": 3620 + }, + { + "epoch": 0.4152046783625731, + "grad_norm": 0.2422495616373672, + "learning_rate": 0.000658846513199008, + "loss": 2.4214, + "step": 3621 + }, + { + "epoch": 0.4153193441119138, + "grad_norm": 0.2568047936888267, + "learning_rate": 0.0006586704271736119, + "loss": 2.5909, + "step": 3622 + }, + { + "epoch": 0.41543400986125445, + "grad_norm": 0.2531581575864393, + "learning_rate": 0.0006584943192626549, + "loss": 2.3493, + "step": 3623 + }, + { + "epoch": 0.4155486756105951, + "grad_norm": 0.254469352707778, + "learning_rate": 0.0006583181894904277, + "loss": 2.4973, + "step": 3624 + }, + { + "epoch": 0.41566334135993577, + "grad_norm": 0.2414600144394221, + "learning_rate": 0.000658142037881224, + "loss": 2.5058, + "step": 3625 + }, + { + "epoch": 0.4157780071092765, + "grad_norm": 0.24872457122834565, + "learning_rate": 0.0006579658644593407, + "loss": 2.575, + "step": 3626 + }, + { + "epoch": 0.41589267285861714, + "grad_norm": 0.25032493262135647, + "learning_rate": 0.0006577896692490775, + "loss": 2.3848, + "step": 3627 + }, + { + "epoch": 0.4160073386079578, + "grad_norm": 0.22987268254333176, + "learning_rate": 0.0006576134522747371, + "loss": 2.5329, + "step": 3628 + }, + { + "epoch": 0.41612200435729846, + "grad_norm": 0.2525234213833264, + "learning_rate": 0.0006574372135606254, + "loss": 2.4661, + "step": 3629 + }, + { + "epoch": 0.4162366701066392, + "grad_norm": 0.26996768285629813, + "learning_rate": 0.0006572609531310511, + "loss": 2.5118, + "step": 3630 + }, + { + "epoch": 0.41635133585597983, + "grad_norm": 0.23804295742950773, + "learning_rate": 0.0006570846710103261, + "loss": 2.4934, + "step": 3631 + }, + { + "epoch": 0.4164660016053205, + "grad_norm": 0.25968217599653287, + "learning_rate": 0.0006569083672227649, + "loss": 2.5503, + "step": 3632 + }, + { + "epoch": 0.41658066735466115, + "grad_norm": 0.22898437997056312, + "learning_rate": 0.0006567320417926855, + "loss": 2.3892, + "step": 3633 + }, + { + "epoch": 0.4166953331040018, + "grad_norm": 0.2344040277561669, + "learning_rate": 0.0006565556947444085, + "loss": 2.5231, + "step": 3634 + }, + { + "epoch": 0.4168099988533425, + "grad_norm": 0.25379413443016824, + "learning_rate": 0.0006563793261022575, + "loss": 2.43, + "step": 3635 + }, + { + "epoch": 0.4169246646026832, + "grad_norm": 0.259298949060862, + "learning_rate": 0.0006562029358905593, + "loss": 2.6198, + "step": 3636 + }, + { + "epoch": 0.41703933035202384, + "grad_norm": 0.24669975065654337, + "learning_rate": 0.0006560265241336438, + "loss": 2.4646, + "step": 3637 + }, + { + "epoch": 0.4171539961013645, + "grad_norm": 0.24109463815641527, + "learning_rate": 0.0006558500908558434, + "loss": 2.4667, + "step": 3638 + }, + { + "epoch": 0.4172686618507052, + "grad_norm": 0.2338632750540656, + "learning_rate": 0.0006556736360814935, + "loss": 2.5078, + "step": 3639 + }, + { + "epoch": 0.4173833276000459, + "grad_norm": 0.2581976408432289, + "learning_rate": 0.000655497159834933, + "loss": 2.5959, + "step": 3640 + }, + { + "epoch": 0.41749799334938653, + "grad_norm": 0.24082886457624678, + "learning_rate": 0.0006553206621405037, + "loss": 2.5883, + "step": 3641 + }, + { + "epoch": 0.4176126590987272, + "grad_norm": 0.24071361211778752, + "learning_rate": 0.0006551441430225493, + "loss": 2.4476, + "step": 3642 + }, + { + "epoch": 0.4177273248480679, + "grad_norm": 0.2306028421093017, + "learning_rate": 0.0006549676025054179, + "loss": 2.4317, + "step": 3643 + }, + { + "epoch": 0.41784199059740856, + "grad_norm": 0.28742091641305784, + "learning_rate": 0.0006547910406134597, + "loss": 2.5942, + "step": 3644 + }, + { + "epoch": 0.4179566563467492, + "grad_norm": 0.2565525780662309, + "learning_rate": 0.000654614457371028, + "loss": 2.4474, + "step": 3645 + }, + { + "epoch": 0.4180713220960899, + "grad_norm": 0.24571613791616312, + "learning_rate": 0.000654437852802479, + "loss": 2.4489, + "step": 3646 + }, + { + "epoch": 0.4181859878454306, + "grad_norm": 0.2605831659380862, + "learning_rate": 0.0006542612269321722, + "loss": 2.3085, + "step": 3647 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.2849931032812933, + "learning_rate": 0.0006540845797844697, + "loss": 2.5407, + "step": 3648 + }, + { + "epoch": 0.4184153193441119, + "grad_norm": 0.24949532036004737, + "learning_rate": 0.0006539079113837363, + "loss": 2.4224, + "step": 3649 + }, + { + "epoch": 0.4185299850934526, + "grad_norm": 0.2343269251897938, + "learning_rate": 0.0006537312217543404, + "loss": 2.3046, + "step": 3650 + }, + { + "epoch": 0.41864465084279323, + "grad_norm": 0.2514037358280559, + "learning_rate": 0.0006535545109206529, + "loss": 2.4381, + "step": 3651 + }, + { + "epoch": 0.41875931659213395, + "grad_norm": 0.29762551092127837, + "learning_rate": 0.0006533777789070475, + "loss": 2.4484, + "step": 3652 + }, + { + "epoch": 0.4188739823414746, + "grad_norm": 0.30377877711475104, + "learning_rate": 0.0006532010257379013, + "loss": 2.4685, + "step": 3653 + }, + { + "epoch": 0.41898864809081526, + "grad_norm": 0.28711003071855085, + "learning_rate": 0.0006530242514375938, + "loss": 2.5352, + "step": 3654 + }, + { + "epoch": 0.4191033138401559, + "grad_norm": 0.2641730946140604, + "learning_rate": 0.0006528474560305076, + "loss": 2.4769, + "step": 3655 + }, + { + "epoch": 0.41921797958949664, + "grad_norm": 0.26220968808717726, + "learning_rate": 0.0006526706395410287, + "loss": 2.5616, + "step": 3656 + }, + { + "epoch": 0.4193326453388373, + "grad_norm": 0.24579250232458383, + "learning_rate": 0.000652493801993545, + "loss": 2.5398, + "step": 3657 + }, + { + "epoch": 0.41944731108817795, + "grad_norm": 0.2584288253945437, + "learning_rate": 0.0006523169434124481, + "loss": 2.5222, + "step": 3658 + }, + { + "epoch": 0.4195619768375186, + "grad_norm": 0.27235083051415143, + "learning_rate": 0.0006521400638221324, + "loss": 2.3881, + "step": 3659 + }, + { + "epoch": 0.4196766425868593, + "grad_norm": 0.27625268398247943, + "learning_rate": 0.0006519631632469949, + "loss": 2.6397, + "step": 3660 + }, + { + "epoch": 0.4197913083362, + "grad_norm": 0.2515383665546757, + "learning_rate": 0.0006517862417114356, + "loss": 2.624, + "step": 3661 + }, + { + "epoch": 0.41990597408554065, + "grad_norm": 0.2460270979163395, + "learning_rate": 0.0006516092992398578, + "loss": 2.4939, + "step": 3662 + }, + { + "epoch": 0.4200206398348813, + "grad_norm": 0.23099606469858047, + "learning_rate": 0.000651432335856667, + "loss": 2.4019, + "step": 3663 + }, + { + "epoch": 0.420135305584222, + "grad_norm": 0.2552418157454789, + "learning_rate": 0.0006512553515862718, + "loss": 2.4975, + "step": 3664 + }, + { + "epoch": 0.4202499713335627, + "grad_norm": 0.23242353915986808, + "learning_rate": 0.0006510783464530842, + "loss": 2.626, + "step": 3665 + }, + { + "epoch": 0.42036463708290334, + "grad_norm": 0.22278426422229544, + "learning_rate": 0.0006509013204815188, + "loss": 2.5481, + "step": 3666 + }, + { + "epoch": 0.420479302832244, + "grad_norm": 0.23955265080111315, + "learning_rate": 0.0006507242736959923, + "loss": 2.4875, + "step": 3667 + }, + { + "epoch": 0.42059396858158465, + "grad_norm": 0.21981000800173509, + "learning_rate": 0.0006505472061209255, + "loss": 2.5418, + "step": 3668 + }, + { + "epoch": 0.42070863433092537, + "grad_norm": 0.24129292094650165, + "learning_rate": 0.0006503701177807413, + "loss": 2.5035, + "step": 3669 + }, + { + "epoch": 0.420823300080266, + "grad_norm": 0.21527620418362667, + "learning_rate": 0.0006501930086998656, + "loss": 2.5824, + "step": 3670 + }, + { + "epoch": 0.4209379658296067, + "grad_norm": 0.24351315526142064, + "learning_rate": 0.0006500158789027273, + "loss": 2.468, + "step": 3671 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.22488655184067774, + "learning_rate": 0.0006498387284137582, + "loss": 2.7689, + "step": 3672 + }, + { + "epoch": 0.42116729732828806, + "grad_norm": 0.2480513170812118, + "learning_rate": 0.0006496615572573925, + "loss": 2.331, + "step": 3673 + }, + { + "epoch": 0.4212819630776287, + "grad_norm": 0.244318439562019, + "learning_rate": 0.0006494843654580678, + "loss": 2.4074, + "step": 3674 + }, + { + "epoch": 0.4213966288269694, + "grad_norm": 0.2540218256842001, + "learning_rate": 0.0006493071530402244, + "loss": 2.414, + "step": 3675 + }, + { + "epoch": 0.42151129457631004, + "grad_norm": 0.23502327944606952, + "learning_rate": 0.0006491299200283054, + "loss": 2.592, + "step": 3676 + }, + { + "epoch": 0.42162596032565075, + "grad_norm": 0.2681510368523747, + "learning_rate": 0.0006489526664467565, + "loss": 2.638, + "step": 3677 + }, + { + "epoch": 0.4217406260749914, + "grad_norm": 0.24727921577216871, + "learning_rate": 0.0006487753923200264, + "loss": 2.5062, + "step": 3678 + }, + { + "epoch": 0.42185529182433207, + "grad_norm": 0.2331083222229182, + "learning_rate": 0.0006485980976725671, + "loss": 2.4754, + "step": 3679 + }, + { + "epoch": 0.4219699575736727, + "grad_norm": 0.24080329141980003, + "learning_rate": 0.0006484207825288326, + "loss": 2.4949, + "step": 3680 + }, + { + "epoch": 0.42208462332301344, + "grad_norm": 0.2574465613492795, + "learning_rate": 0.0006482434469132803, + "loss": 2.5269, + "step": 3681 + }, + { + "epoch": 0.4221992890723541, + "grad_norm": 0.2649815871035571, + "learning_rate": 0.0006480660908503704, + "loss": 2.5162, + "step": 3682 + }, + { + "epoch": 0.42231395482169476, + "grad_norm": 0.25100304750393715, + "learning_rate": 0.0006478887143645655, + "loss": 2.571, + "step": 3683 + }, + { + "epoch": 0.4224286205710354, + "grad_norm": 0.28004326570790655, + "learning_rate": 0.0006477113174803317, + "loss": 2.5961, + "step": 3684 + }, + { + "epoch": 0.4225432863203761, + "grad_norm": 0.2614567496956879, + "learning_rate": 0.0006475339002221371, + "loss": 2.5287, + "step": 3685 + }, + { + "epoch": 0.4226579520697168, + "grad_norm": 0.23950864239161568, + "learning_rate": 0.0006473564626144532, + "loss": 2.5136, + "step": 3686 + }, + { + "epoch": 0.42277261781905745, + "grad_norm": 0.25308605542563317, + "learning_rate": 0.0006471790046817542, + "loss": 2.424, + "step": 3687 + }, + { + "epoch": 0.4228872835683981, + "grad_norm": 0.23941390075116217, + "learning_rate": 0.0006470015264485172, + "loss": 2.4019, + "step": 3688 + }, + { + "epoch": 0.42300194931773877, + "grad_norm": 0.22787922982155834, + "learning_rate": 0.0006468240279392214, + "loss": 2.5225, + "step": 3689 + }, + { + "epoch": 0.4231166150670795, + "grad_norm": 0.2261014479388271, + "learning_rate": 0.0006466465091783497, + "loss": 2.5607, + "step": 3690 + }, + { + "epoch": 0.42323128081642014, + "grad_norm": 0.24999187077182888, + "learning_rate": 0.0006464689701903876, + "loss": 2.4943, + "step": 3691 + }, + { + "epoch": 0.4233459465657608, + "grad_norm": 0.22771112716645384, + "learning_rate": 0.000646291410999823, + "loss": 2.444, + "step": 3692 + }, + { + "epoch": 0.42346061231510146, + "grad_norm": 0.25846594342301527, + "learning_rate": 0.0006461138316311467, + "loss": 2.5091, + "step": 3693 + }, + { + "epoch": 0.4235752780644422, + "grad_norm": 0.25574418375764013, + "learning_rate": 0.0006459362321088527, + "loss": 2.4611, + "step": 3694 + }, + { + "epoch": 0.42368994381378283, + "grad_norm": 0.2575415996951374, + "learning_rate": 0.000645758612457437, + "loss": 2.5979, + "step": 3695 + }, + { + "epoch": 0.4238046095631235, + "grad_norm": 0.23552465592246552, + "learning_rate": 0.0006455809727013992, + "loss": 2.4927, + "step": 3696 + }, + { + "epoch": 0.42391927531246415, + "grad_norm": 0.24466490532846727, + "learning_rate": 0.0006454033128652414, + "loss": 2.4182, + "step": 3697 + }, + { + "epoch": 0.42403394106180486, + "grad_norm": 0.2338675433072939, + "learning_rate": 0.0006452256329734682, + "loss": 2.457, + "step": 3698 + }, + { + "epoch": 0.4241486068111455, + "grad_norm": 0.24434853884220542, + "learning_rate": 0.0006450479330505869, + "loss": 2.4482, + "step": 3699 + }, + { + "epoch": 0.4242632725604862, + "grad_norm": 0.24994143018382756, + "learning_rate": 0.0006448702131211084, + "loss": 2.484, + "step": 3700 + }, + { + "epoch": 0.42437793830982684, + "grad_norm": 0.270087048825776, + "learning_rate": 0.0006446924732095455, + "loss": 2.5821, + "step": 3701 + }, + { + "epoch": 0.4244926040591675, + "grad_norm": 0.2663004701614018, + "learning_rate": 0.0006445147133404139, + "loss": 2.6339, + "step": 3702 + }, + { + "epoch": 0.4246072698085082, + "grad_norm": 0.2571666460587385, + "learning_rate": 0.0006443369335382322, + "loss": 2.5783, + "step": 3703 + }, + { + "epoch": 0.4247219355578489, + "grad_norm": 0.24683490224213048, + "learning_rate": 0.000644159133827522, + "loss": 2.5038, + "step": 3704 + }, + { + "epoch": 0.42483660130718953, + "grad_norm": 0.23592983938425593, + "learning_rate": 0.000643981314232807, + "loss": 2.4197, + "step": 3705 + }, + { + "epoch": 0.4249512670565302, + "grad_norm": 0.2572591280936109, + "learning_rate": 0.0006438034747786144, + "loss": 2.5235, + "step": 3706 + }, + { + "epoch": 0.4250659328058709, + "grad_norm": 0.2640370097849018, + "learning_rate": 0.0006436256154894737, + "loss": 2.6165, + "step": 3707 + }, + { + "epoch": 0.42518059855521156, + "grad_norm": 0.23384258285351509, + "learning_rate": 0.0006434477363899168, + "loss": 2.5107, + "step": 3708 + }, + { + "epoch": 0.4252952643045522, + "grad_norm": 0.23186840917901988, + "learning_rate": 0.0006432698375044793, + "loss": 2.4481, + "step": 3709 + }, + { + "epoch": 0.4254099300538929, + "grad_norm": 0.2443600239659201, + "learning_rate": 0.0006430919188576986, + "loss": 2.6296, + "step": 3710 + }, + { + "epoch": 0.4255245958032336, + "grad_norm": 0.2356305614139818, + "learning_rate": 0.0006429139804741151, + "loss": 2.3981, + "step": 3711 + }, + { + "epoch": 0.42563926155257426, + "grad_norm": 0.24227539571917678, + "learning_rate": 0.0006427360223782726, + "loss": 2.5319, + "step": 3712 + }, + { + "epoch": 0.4257539273019149, + "grad_norm": 0.2645723066921694, + "learning_rate": 0.0006425580445947163, + "loss": 2.492, + "step": 3713 + }, + { + "epoch": 0.4258685930512556, + "grad_norm": 0.23120728446033437, + "learning_rate": 0.000642380047147995, + "loss": 2.67, + "step": 3714 + }, + { + "epoch": 0.4259832588005963, + "grad_norm": 0.2648993174355372, + "learning_rate": 0.0006422020300626604, + "loss": 2.3976, + "step": 3715 + }, + { + "epoch": 0.42609792454993695, + "grad_norm": 0.2484545470537326, + "learning_rate": 0.0006420239933632666, + "loss": 2.4172, + "step": 3716 + }, + { + "epoch": 0.4262125902992776, + "grad_norm": 0.27046625234251764, + "learning_rate": 0.0006418459370743698, + "loss": 2.5206, + "step": 3717 + }, + { + "epoch": 0.42632725604861826, + "grad_norm": 0.31105171709620916, + "learning_rate": 0.0006416678612205298, + "loss": 2.4857, + "step": 3718 + }, + { + "epoch": 0.4264419217979589, + "grad_norm": 0.2581816029461363, + "learning_rate": 0.000641489765826309, + "loss": 2.5993, + "step": 3719 + }, + { + "epoch": 0.42655658754729964, + "grad_norm": 0.2781239094791762, + "learning_rate": 0.0006413116509162719, + "loss": 2.4373, + "step": 3720 + }, + { + "epoch": 0.4266712532966403, + "grad_norm": 0.2292560634301624, + "learning_rate": 0.0006411335165149863, + "loss": 2.2859, + "step": 3721 + }, + { + "epoch": 0.42678591904598095, + "grad_norm": 0.2456432796770211, + "learning_rate": 0.0006409553626470223, + "loss": 2.4036, + "step": 3722 + }, + { + "epoch": 0.4269005847953216, + "grad_norm": 0.2629152475483945, + "learning_rate": 0.0006407771893369529, + "loss": 2.4365, + "step": 3723 + }, + { + "epoch": 0.42701525054466233, + "grad_norm": 0.3050686891133825, + "learning_rate": 0.0006405989966093536, + "loss": 2.4266, + "step": 3724 + }, + { + "epoch": 0.427129916294003, + "grad_norm": 0.2631024282904743, + "learning_rate": 0.0006404207844888029, + "loss": 2.4018, + "step": 3725 + }, + { + "epoch": 0.42724458204334365, + "grad_norm": 0.26369223526007296, + "learning_rate": 0.0006402425529998816, + "loss": 2.5805, + "step": 3726 + }, + { + "epoch": 0.4273592477926843, + "grad_norm": 0.23744506051488645, + "learning_rate": 0.0006400643021671734, + "loss": 2.4739, + "step": 3727 + }, + { + "epoch": 0.427473913542025, + "grad_norm": 0.24404333486975455, + "learning_rate": 0.0006398860320152645, + "loss": 2.5921, + "step": 3728 + }, + { + "epoch": 0.4275885792913657, + "grad_norm": 0.29878729898666484, + "learning_rate": 0.0006397077425687441, + "loss": 2.6021, + "step": 3729 + }, + { + "epoch": 0.42770324504070634, + "grad_norm": 0.22748280688714, + "learning_rate": 0.0006395294338522034, + "loss": 2.4298, + "step": 3730 + }, + { + "epoch": 0.427817910790047, + "grad_norm": 0.23387627875344763, + "learning_rate": 0.0006393511058902373, + "loss": 2.3872, + "step": 3731 + }, + { + "epoch": 0.4279325765393877, + "grad_norm": 0.2835332758032488, + "learning_rate": 0.0006391727587074423, + "loss": 2.5757, + "step": 3732 + }, + { + "epoch": 0.42804724228872837, + "grad_norm": 0.2312958461587959, + "learning_rate": 0.000638994392328418, + "loss": 2.396, + "step": 3733 + }, + { + "epoch": 0.42816190803806903, + "grad_norm": 0.25887357672841227, + "learning_rate": 0.0006388160067777669, + "loss": 2.5799, + "step": 3734 + }, + { + "epoch": 0.4282765737874097, + "grad_norm": 0.307914208229826, + "learning_rate": 0.0006386376020800936, + "loss": 2.5125, + "step": 3735 + }, + { + "epoch": 0.42839123953675035, + "grad_norm": 0.2680163434022882, + "learning_rate": 0.0006384591782600058, + "loss": 2.4314, + "step": 3736 + }, + { + "epoch": 0.42850590528609106, + "grad_norm": 0.28050719326701523, + "learning_rate": 0.0006382807353421138, + "loss": 2.7169, + "step": 3737 + }, + { + "epoch": 0.4286205710354317, + "grad_norm": 0.24226221061292363, + "learning_rate": 0.0006381022733510299, + "loss": 2.6226, + "step": 3738 + }, + { + "epoch": 0.4287352367847724, + "grad_norm": 0.24580847946134887, + "learning_rate": 0.0006379237923113701, + "loss": 2.5466, + "step": 3739 + }, + { + "epoch": 0.42884990253411304, + "grad_norm": 0.25087045518254625, + "learning_rate": 0.0006377452922477523, + "loss": 2.4611, + "step": 3740 + }, + { + "epoch": 0.42896456828345375, + "grad_norm": 0.23830860797130302, + "learning_rate": 0.0006375667731847969, + "loss": 2.4343, + "step": 3741 + }, + { + "epoch": 0.4290792340327944, + "grad_norm": 0.25650783675335387, + "learning_rate": 0.0006373882351471275, + "loss": 2.5961, + "step": 3742 + }, + { + "epoch": 0.42919389978213507, + "grad_norm": 0.24559551902348492, + "learning_rate": 0.0006372096781593699, + "loss": 2.4247, + "step": 3743 + }, + { + "epoch": 0.4293085655314757, + "grad_norm": 0.2446883345321881, + "learning_rate": 0.0006370311022461528, + "loss": 2.6487, + "step": 3744 + }, + { + "epoch": 0.42942323128081644, + "grad_norm": 0.24647327899397467, + "learning_rate": 0.000636852507432107, + "loss": 2.5216, + "step": 3745 + }, + { + "epoch": 0.4295378970301571, + "grad_norm": 0.265291680237914, + "learning_rate": 0.0006366738937418664, + "loss": 2.4052, + "step": 3746 + }, + { + "epoch": 0.42965256277949776, + "grad_norm": 0.24621810515357678, + "learning_rate": 0.0006364952612000676, + "loss": 2.5064, + "step": 3747 + }, + { + "epoch": 0.4297672285288384, + "grad_norm": 0.2378835341393557, + "learning_rate": 0.0006363166098313492, + "loss": 2.4359, + "step": 3748 + }, + { + "epoch": 0.42988189427817913, + "grad_norm": 0.2267593299771046, + "learning_rate": 0.0006361379396603529, + "loss": 2.3247, + "step": 3749 + }, + { + "epoch": 0.4299965600275198, + "grad_norm": 0.24720079023203084, + "learning_rate": 0.0006359592507117229, + "loss": 2.3598, + "step": 3750 + }, + { + "epoch": 0.43011122577686045, + "grad_norm": 0.26042743558804593, + "learning_rate": 0.000635780543010106, + "loss": 2.4743, + "step": 3751 + }, + { + "epoch": 0.4302258915262011, + "grad_norm": 0.26468915150417194, + "learning_rate": 0.0006356018165801511, + "loss": 2.4608, + "step": 3752 + }, + { + "epoch": 0.43034055727554177, + "grad_norm": 0.2305352737816054, + "learning_rate": 0.0006354230714465107, + "loss": 2.5031, + "step": 3753 + }, + { + "epoch": 0.4304552230248825, + "grad_norm": 0.2496842595140439, + "learning_rate": 0.000635244307633839, + "loss": 2.4778, + "step": 3754 + }, + { + "epoch": 0.43056988877422314, + "grad_norm": 0.2590041276695064, + "learning_rate": 0.0006350655251667927, + "loss": 2.4273, + "step": 3755 + }, + { + "epoch": 0.4306845545235638, + "grad_norm": 0.2648184856956326, + "learning_rate": 0.0006348867240700321, + "loss": 2.4532, + "step": 3756 + }, + { + "epoch": 0.43079922027290446, + "grad_norm": 0.2545433220533618, + "learning_rate": 0.0006347079043682191, + "loss": 2.6002, + "step": 3757 + }, + { + "epoch": 0.4309138860222452, + "grad_norm": 0.2465823638818654, + "learning_rate": 0.0006345290660860184, + "loss": 2.4923, + "step": 3758 + }, + { + "epoch": 0.43102855177158583, + "grad_norm": 0.23451123863473178, + "learning_rate": 0.0006343502092480973, + "loss": 2.4748, + "step": 3759 + }, + { + "epoch": 0.4311432175209265, + "grad_norm": 0.2338368793288413, + "learning_rate": 0.0006341713338791258, + "loss": 2.4136, + "step": 3760 + }, + { + "epoch": 0.43125788327026715, + "grad_norm": 0.2324315784484788, + "learning_rate": 0.0006339924400037765, + "loss": 2.5137, + "step": 3761 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 0.2281217999996604, + "learning_rate": 0.0006338135276467241, + "loss": 2.463, + "step": 3762 + }, + { + "epoch": 0.4314872147689485, + "grad_norm": 0.24134162249044894, + "learning_rate": 0.0006336345968326462, + "loss": 2.4698, + "step": 3763 + }, + { + "epoch": 0.4316018805182892, + "grad_norm": 0.24365597122225233, + "learning_rate": 0.0006334556475862231, + "loss": 2.4225, + "step": 3764 + }, + { + "epoch": 0.43171654626762984, + "grad_norm": 0.22942000523899764, + "learning_rate": 0.0006332766799321372, + "loss": 2.6032, + "step": 3765 + }, + { + "epoch": 0.43183121201697056, + "grad_norm": 0.2296419363673588, + "learning_rate": 0.000633097693895074, + "loss": 2.5136, + "step": 3766 + }, + { + "epoch": 0.4319458777663112, + "grad_norm": 0.23517661546603696, + "learning_rate": 0.0006329186894997208, + "loss": 2.528, + "step": 3767 + }, + { + "epoch": 0.4320605435156519, + "grad_norm": 0.25223796241212604, + "learning_rate": 0.0006327396667707682, + "loss": 2.4296, + "step": 3768 + }, + { + "epoch": 0.43217520926499253, + "grad_norm": 0.30022384597663754, + "learning_rate": 0.0006325606257329086, + "loss": 2.5669, + "step": 3769 + }, + { + "epoch": 0.4322898750143332, + "grad_norm": 0.2245703909708394, + "learning_rate": 0.0006323815664108376, + "loss": 2.356, + "step": 3770 + }, + { + "epoch": 0.4324045407636739, + "grad_norm": 0.2606408643947649, + "learning_rate": 0.000632202488829253, + "loss": 2.5251, + "step": 3771 + }, + { + "epoch": 0.43251920651301456, + "grad_norm": 0.23537474111510084, + "learning_rate": 0.0006320233930128551, + "loss": 2.4366, + "step": 3772 + }, + { + "epoch": 0.4326338722623552, + "grad_norm": 0.2356086924610653, + "learning_rate": 0.0006318442789863466, + "loss": 2.5444, + "step": 3773 + }, + { + "epoch": 0.4327485380116959, + "grad_norm": 0.2531315608513024, + "learning_rate": 0.0006316651467744332, + "loss": 2.3707, + "step": 3774 + }, + { + "epoch": 0.4328632037610366, + "grad_norm": 0.2594569588939986, + "learning_rate": 0.0006314859964018224, + "loss": 2.5726, + "step": 3775 + }, + { + "epoch": 0.43297786951037726, + "grad_norm": 0.24059963468784043, + "learning_rate": 0.0006313068278932248, + "loss": 2.7038, + "step": 3776 + }, + { + "epoch": 0.4330925352597179, + "grad_norm": 0.25555491876004655, + "learning_rate": 0.0006311276412733532, + "loss": 2.5235, + "step": 3777 + }, + { + "epoch": 0.4332072010090586, + "grad_norm": 0.25295838276705923, + "learning_rate": 0.000630948436566923, + "loss": 2.4533, + "step": 3778 + }, + { + "epoch": 0.4333218667583993, + "grad_norm": 0.2688805844503762, + "learning_rate": 0.0006307692137986522, + "loss": 2.4243, + "step": 3779 + }, + { + "epoch": 0.43343653250773995, + "grad_norm": 0.2188599506955828, + "learning_rate": 0.0006305899729932608, + "loss": 2.475, + "step": 3780 + }, + { + "epoch": 0.4335511982570806, + "grad_norm": 0.24833190551796147, + "learning_rate": 0.0006304107141754721, + "loss": 2.4303, + "step": 3781 + }, + { + "epoch": 0.43366586400642126, + "grad_norm": 0.24515110713446844, + "learning_rate": 0.0006302314373700113, + "loss": 2.4447, + "step": 3782 + }, + { + "epoch": 0.433780529755762, + "grad_norm": 0.23527766995876795, + "learning_rate": 0.000630052142601606, + "loss": 2.4665, + "step": 3783 + }, + { + "epoch": 0.43389519550510264, + "grad_norm": 0.26971669511913365, + "learning_rate": 0.0006298728298949866, + "loss": 2.5148, + "step": 3784 + }, + { + "epoch": 0.4340098612544433, + "grad_norm": 0.24451204525007422, + "learning_rate": 0.0006296934992748859, + "loss": 2.4205, + "step": 3785 + }, + { + "epoch": 0.43412452700378396, + "grad_norm": 0.2578467598176871, + "learning_rate": 0.0006295141507660394, + "loss": 2.4938, + "step": 3786 + }, + { + "epoch": 0.4342391927531246, + "grad_norm": 0.23212579148929557, + "learning_rate": 0.0006293347843931844, + "loss": 2.4657, + "step": 3787 + }, + { + "epoch": 0.43435385850246533, + "grad_norm": 0.25282789182567583, + "learning_rate": 0.0006291554001810612, + "loss": 2.4794, + "step": 3788 + }, + { + "epoch": 0.434468524251806, + "grad_norm": 0.24879253120626094, + "learning_rate": 0.0006289759981544126, + "loss": 2.4861, + "step": 3789 + }, + { + "epoch": 0.43458319000114665, + "grad_norm": 0.24143234228504765, + "learning_rate": 0.0006287965783379834, + "loss": 2.497, + "step": 3790 + }, + { + "epoch": 0.4346978557504873, + "grad_norm": 0.2355557012793554, + "learning_rate": 0.0006286171407565214, + "loss": 2.3984, + "step": 3791 + }, + { + "epoch": 0.434812521499828, + "grad_norm": 0.25739912543839666, + "learning_rate": 0.0006284376854347766, + "loss": 2.3254, + "step": 3792 + }, + { + "epoch": 0.4349271872491687, + "grad_norm": 0.24402508892393346, + "learning_rate": 0.0006282582123975011, + "loss": 2.5377, + "step": 3793 + }, + { + "epoch": 0.43504185299850934, + "grad_norm": 0.2773765602416316, + "learning_rate": 0.0006280787216694502, + "loss": 2.5083, + "step": 3794 + }, + { + "epoch": 0.43515651874785, + "grad_norm": 0.2446188541949575, + "learning_rate": 0.000627899213275381, + "loss": 2.4669, + "step": 3795 + }, + { + "epoch": 0.4352711844971907, + "grad_norm": 0.25016913690541104, + "learning_rate": 0.0006277196872400534, + "loss": 2.4943, + "step": 3796 + }, + { + "epoch": 0.43538585024653137, + "grad_norm": 0.24296763525600565, + "learning_rate": 0.0006275401435882294, + "loss": 2.6248, + "step": 3797 + }, + { + "epoch": 0.43550051599587203, + "grad_norm": 0.23540469014095888, + "learning_rate": 0.0006273605823446738, + "loss": 2.5115, + "step": 3798 + }, + { + "epoch": 0.4356151817452127, + "grad_norm": 0.24191755703757348, + "learning_rate": 0.0006271810035341534, + "loss": 2.4802, + "step": 3799 + }, + { + "epoch": 0.4357298474945534, + "grad_norm": 0.24929082712908535, + "learning_rate": 0.000627001407181438, + "loss": 2.5123, + "step": 3800 + }, + { + "epoch": 0.43584451324389406, + "grad_norm": 0.2607264207787983, + "learning_rate": 0.0006268217933112994, + "loss": 2.4693, + "step": 3801 + }, + { + "epoch": 0.4359591789932347, + "grad_norm": 0.22791731122897538, + "learning_rate": 0.0006266421619485116, + "loss": 2.5408, + "step": 3802 + }, + { + "epoch": 0.4360738447425754, + "grad_norm": 0.2613530812527927, + "learning_rate": 0.0006264625131178519, + "loss": 2.5761, + "step": 3803 + }, + { + "epoch": 0.43618851049191604, + "grad_norm": 0.24247552865664587, + "learning_rate": 0.000626282846844099, + "loss": 2.5135, + "step": 3804 + }, + { + "epoch": 0.43630317624125675, + "grad_norm": 0.2497939969879368, + "learning_rate": 0.0006261031631520345, + "loss": 2.556, + "step": 3805 + }, + { + "epoch": 0.4364178419905974, + "grad_norm": 0.2444041005359748, + "learning_rate": 0.0006259234620664423, + "loss": 2.5556, + "step": 3806 + }, + { + "epoch": 0.43653250773993807, + "grad_norm": 0.26548794040209045, + "learning_rate": 0.0006257437436121091, + "loss": 2.6427, + "step": 3807 + }, + { + "epoch": 0.43664717348927873, + "grad_norm": 0.25278498904062957, + "learning_rate": 0.0006255640078138229, + "loss": 2.5058, + "step": 3808 + }, + { + "epoch": 0.43676183923861944, + "grad_norm": 0.22627951201911523, + "learning_rate": 0.0006253842546963757, + "loss": 2.4026, + "step": 3809 + }, + { + "epoch": 0.4368765049879601, + "grad_norm": 0.22829330932828906, + "learning_rate": 0.0006252044842845606, + "loss": 2.4813, + "step": 3810 + }, + { + "epoch": 0.43699117073730076, + "grad_norm": 0.22259102899055572, + "learning_rate": 0.0006250246966031733, + "loss": 2.5642, + "step": 3811 + }, + { + "epoch": 0.4371058364866414, + "grad_norm": 0.2381572298918821, + "learning_rate": 0.0006248448916770124, + "loss": 2.434, + "step": 3812 + }, + { + "epoch": 0.43722050223598213, + "grad_norm": 0.22236994168538668, + "learning_rate": 0.0006246650695308784, + "loss": 2.4427, + "step": 3813 + }, + { + "epoch": 0.4373351679853228, + "grad_norm": 0.22964165361230704, + "learning_rate": 0.0006244852301895745, + "loss": 2.4248, + "step": 3814 + }, + { + "epoch": 0.43744983373466345, + "grad_norm": 0.24227387661447042, + "learning_rate": 0.0006243053736779058, + "loss": 2.4845, + "step": 3815 + }, + { + "epoch": 0.4375644994840041, + "grad_norm": 0.26371652912392507, + "learning_rate": 0.0006241255000206806, + "loss": 2.617, + "step": 3816 + }, + { + "epoch": 0.4376791652333448, + "grad_norm": 0.2569499544365948, + "learning_rate": 0.0006239456092427085, + "loss": 2.4847, + "step": 3817 + }, + { + "epoch": 0.4377938309826855, + "grad_norm": 0.25737505633385255, + "learning_rate": 0.0006237657013688022, + "loss": 2.3512, + "step": 3818 + }, + { + "epoch": 0.43790849673202614, + "grad_norm": 0.28322089965060304, + "learning_rate": 0.0006235857764237767, + "loss": 2.419, + "step": 3819 + }, + { + "epoch": 0.4380231624813668, + "grad_norm": 0.25947777793560933, + "learning_rate": 0.0006234058344324491, + "loss": 2.4968, + "step": 3820 + }, + { + "epoch": 0.43813782823070746, + "grad_norm": 0.22628442212614977, + "learning_rate": 0.0006232258754196388, + "loss": 2.5635, + "step": 3821 + }, + { + "epoch": 0.4382524939800482, + "grad_norm": 0.26789431207644143, + "learning_rate": 0.0006230458994101681, + "loss": 2.4917, + "step": 3822 + }, + { + "epoch": 0.43836715972938883, + "grad_norm": 0.25978882136460274, + "learning_rate": 0.0006228659064288609, + "loss": 2.6976, + "step": 3823 + }, + { + "epoch": 0.4384818254787295, + "grad_norm": 0.22690374013611359, + "learning_rate": 0.0006226858965005439, + "loss": 2.448, + "step": 3824 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 0.22572923677405254, + "learning_rate": 0.0006225058696500462, + "loss": 2.4013, + "step": 3825 + }, + { + "epoch": 0.43871115697741087, + "grad_norm": 0.2446367548888773, + "learning_rate": 0.000622325825902199, + "loss": 2.453, + "step": 3826 + }, + { + "epoch": 0.4388258227267515, + "grad_norm": 0.21250331424107224, + "learning_rate": 0.0006221457652818357, + "loss": 2.5505, + "step": 3827 + }, + { + "epoch": 0.4389404884760922, + "grad_norm": 0.2221859098836378, + "learning_rate": 0.0006219656878137925, + "loss": 2.5177, + "step": 3828 + }, + { + "epoch": 0.43905515422543284, + "grad_norm": 0.2258421138052024, + "learning_rate": 0.0006217855935229075, + "loss": 2.4143, + "step": 3829 + }, + { + "epoch": 0.43916981997477356, + "grad_norm": 0.2359458275586472, + "learning_rate": 0.0006216054824340212, + "loss": 2.5349, + "step": 3830 + }, + { + "epoch": 0.4392844857241142, + "grad_norm": 0.26109347273243705, + "learning_rate": 0.0006214253545719768, + "loss": 2.6663, + "step": 3831 + }, + { + "epoch": 0.4393991514734549, + "grad_norm": 0.22306737299137816, + "learning_rate": 0.0006212452099616194, + "loss": 2.6206, + "step": 3832 + }, + { + "epoch": 0.43951381722279553, + "grad_norm": 0.25070728465884073, + "learning_rate": 0.0006210650486277961, + "loss": 2.4855, + "step": 3833 + }, + { + "epoch": 0.43962848297213625, + "grad_norm": 0.26101690754790224, + "learning_rate": 0.0006208848705953573, + "loss": 2.5547, + "step": 3834 + }, + { + "epoch": 0.4397431487214769, + "grad_norm": 0.25975082801274285, + "learning_rate": 0.0006207046758891548, + "loss": 2.4832, + "step": 3835 + }, + { + "epoch": 0.43985781447081757, + "grad_norm": 0.2393787817938355, + "learning_rate": 0.0006205244645340431, + "loss": 2.4284, + "step": 3836 + }, + { + "epoch": 0.4399724802201582, + "grad_norm": 0.23961291386277636, + "learning_rate": 0.0006203442365548791, + "loss": 2.4275, + "step": 3837 + }, + { + "epoch": 0.4400871459694989, + "grad_norm": 0.26333085205637896, + "learning_rate": 0.0006201639919765214, + "loss": 2.4963, + "step": 3838 + }, + { + "epoch": 0.4402018117188396, + "grad_norm": 0.25231724731244304, + "learning_rate": 0.0006199837308238315, + "loss": 2.4523, + "step": 3839 + }, + { + "epoch": 0.44031647746818026, + "grad_norm": 0.2633268161070358, + "learning_rate": 0.0006198034531216731, + "loss": 2.7312, + "step": 3840 + }, + { + "epoch": 0.4404311432175209, + "grad_norm": 0.2748456169816904, + "learning_rate": 0.0006196231588949121, + "loss": 2.4758, + "step": 3841 + }, + { + "epoch": 0.4405458089668616, + "grad_norm": 0.28739378399684495, + "learning_rate": 0.0006194428481684166, + "loss": 2.6028, + "step": 3842 + }, + { + "epoch": 0.4406604747162023, + "grad_norm": 0.27796273392966664, + "learning_rate": 0.0006192625209670568, + "loss": 2.3836, + "step": 3843 + }, + { + "epoch": 0.44077514046554295, + "grad_norm": 0.2892768369562647, + "learning_rate": 0.0006190821773157058, + "loss": 2.5094, + "step": 3844 + }, + { + "epoch": 0.4408898062148836, + "grad_norm": 0.24011257302760441, + "learning_rate": 0.0006189018172392382, + "loss": 2.5442, + "step": 3845 + }, + { + "epoch": 0.44100447196422426, + "grad_norm": 0.2632337942022086, + "learning_rate": 0.0006187214407625313, + "loss": 2.4544, + "step": 3846 + }, + { + "epoch": 0.441119137713565, + "grad_norm": 0.24290546998360119, + "learning_rate": 0.000618541047910465, + "loss": 2.4866, + "step": 3847 + }, + { + "epoch": 0.44123380346290564, + "grad_norm": 0.25689002083208545, + "learning_rate": 0.0006183606387079205, + "loss": 2.6536, + "step": 3848 + }, + { + "epoch": 0.4413484692122463, + "grad_norm": 0.2637469231521172, + "learning_rate": 0.0006181802131797821, + "loss": 2.5916, + "step": 3849 + }, + { + "epoch": 0.44146313496158696, + "grad_norm": 0.22675212961957192, + "learning_rate": 0.0006179997713509359, + "loss": 2.5034, + "step": 3850 + }, + { + "epoch": 0.44157780071092767, + "grad_norm": 0.2384649535056212, + "learning_rate": 0.0006178193132462706, + "loss": 2.4278, + "step": 3851 + }, + { + "epoch": 0.44169246646026833, + "grad_norm": 0.2349726456516134, + "learning_rate": 0.000617638838890677, + "loss": 2.6831, + "step": 3852 + }, + { + "epoch": 0.441807132209609, + "grad_norm": 0.2633477014569964, + "learning_rate": 0.0006174583483090478, + "loss": 2.4221, + "step": 3853 + }, + { + "epoch": 0.44192179795894965, + "grad_norm": 0.2332608319041885, + "learning_rate": 0.0006172778415262785, + "loss": 2.3146, + "step": 3854 + }, + { + "epoch": 0.44203646370829036, + "grad_norm": 0.30116217089939906, + "learning_rate": 0.0006170973185672664, + "loss": 2.345, + "step": 3855 + }, + { + "epoch": 0.442151129457631, + "grad_norm": 0.2368935097053184, + "learning_rate": 0.0006169167794569114, + "loss": 2.5089, + "step": 3856 + }, + { + "epoch": 0.4422657952069717, + "grad_norm": 0.27957531433546395, + "learning_rate": 0.0006167362242201153, + "loss": 2.4535, + "step": 3857 + }, + { + "epoch": 0.44238046095631234, + "grad_norm": 0.27333998451105695, + "learning_rate": 0.000616555652881782, + "loss": 2.5352, + "step": 3858 + }, + { + "epoch": 0.442495126705653, + "grad_norm": 0.25179900070696376, + "learning_rate": 0.0006163750654668186, + "loss": 2.533, + "step": 3859 + }, + { + "epoch": 0.4426097924549937, + "grad_norm": 0.24506089272052226, + "learning_rate": 0.0006161944620001328, + "loss": 2.591, + "step": 3860 + }, + { + "epoch": 0.44272445820433437, + "grad_norm": 0.2488847921021115, + "learning_rate": 0.0006160138425066361, + "loss": 2.4796, + "step": 3861 + }, + { + "epoch": 0.44283912395367503, + "grad_norm": 0.2604219886655084, + "learning_rate": 0.0006158332070112412, + "loss": 2.4917, + "step": 3862 + }, + { + "epoch": 0.4429537897030157, + "grad_norm": 0.258590520124936, + "learning_rate": 0.0006156525555388633, + "loss": 2.5954, + "step": 3863 + }, + { + "epoch": 0.4430684554523564, + "grad_norm": 0.2387580872999713, + "learning_rate": 0.0006154718881144199, + "loss": 2.4027, + "step": 3864 + }, + { + "epoch": 0.44318312120169706, + "grad_norm": 0.24549099971447785, + "learning_rate": 0.0006152912047628307, + "loss": 2.4684, + "step": 3865 + }, + { + "epoch": 0.4432977869510377, + "grad_norm": 0.2508850192649552, + "learning_rate": 0.0006151105055090174, + "loss": 2.5822, + "step": 3866 + }, + { + "epoch": 0.4434124527003784, + "grad_norm": 0.2194730128716753, + "learning_rate": 0.0006149297903779042, + "loss": 2.6273, + "step": 3867 + }, + { + "epoch": 0.4435271184497191, + "grad_norm": 0.24156538980703018, + "learning_rate": 0.000614749059394417, + "loss": 2.4084, + "step": 3868 + }, + { + "epoch": 0.44364178419905975, + "grad_norm": 0.24259527687767338, + "learning_rate": 0.0006145683125834846, + "loss": 2.5858, + "step": 3869 + }, + { + "epoch": 0.4437564499484004, + "grad_norm": 0.25413084606845093, + "learning_rate": 0.000614387549970037, + "loss": 2.6548, + "step": 3870 + }, + { + "epoch": 0.44387111569774107, + "grad_norm": 0.19819001733668784, + "learning_rate": 0.0006142067715790077, + "loss": 2.4568, + "step": 3871 + }, + { + "epoch": 0.4439857814470818, + "grad_norm": 0.24614956490109688, + "learning_rate": 0.000614025977435331, + "loss": 2.5309, + "step": 3872 + }, + { + "epoch": 0.44410044719642244, + "grad_norm": 0.22962040823978638, + "learning_rate": 0.0006138451675639443, + "loss": 2.4202, + "step": 3873 + }, + { + "epoch": 0.4442151129457631, + "grad_norm": 0.23898955761888835, + "learning_rate": 0.0006136643419897867, + "loss": 2.5688, + "step": 3874 + }, + { + "epoch": 0.44432977869510376, + "grad_norm": 0.2434298411416163, + "learning_rate": 0.0006134835007377999, + "loss": 2.6224, + "step": 3875 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.2552399708065581, + "learning_rate": 0.0006133026438329274, + "loss": 2.426, + "step": 3876 + }, + { + "epoch": 0.44455911019378513, + "grad_norm": 0.24162588918886282, + "learning_rate": 0.0006131217713001148, + "loss": 2.5712, + "step": 3877 + }, + { + "epoch": 0.4446737759431258, + "grad_norm": 0.23899440037493203, + "learning_rate": 0.0006129408831643102, + "loss": 2.4479, + "step": 3878 + }, + { + "epoch": 0.44478844169246645, + "grad_norm": 0.23280427428474385, + "learning_rate": 0.0006127599794504637, + "loss": 2.5349, + "step": 3879 + }, + { + "epoch": 0.4449031074418071, + "grad_norm": 0.22526148695649914, + "learning_rate": 0.0006125790601835272, + "loss": 2.6287, + "step": 3880 + }, + { + "epoch": 0.4450177731911478, + "grad_norm": 0.23862405763550582, + "learning_rate": 0.0006123981253884556, + "loss": 2.4412, + "step": 3881 + }, + { + "epoch": 0.4451324389404885, + "grad_norm": 0.27119703023534425, + "learning_rate": 0.000612217175090205, + "loss": 2.5351, + "step": 3882 + }, + { + "epoch": 0.44524710468982914, + "grad_norm": 0.23155040221695772, + "learning_rate": 0.000612036209313734, + "loss": 2.4429, + "step": 3883 + }, + { + "epoch": 0.4453617704391698, + "grad_norm": 0.26284288531812217, + "learning_rate": 0.0006118552280840037, + "loss": 2.5161, + "step": 3884 + }, + { + "epoch": 0.4454764361885105, + "grad_norm": 0.22384431229291182, + "learning_rate": 0.0006116742314259769, + "loss": 2.4255, + "step": 3885 + }, + { + "epoch": 0.4455911019378512, + "grad_norm": 0.25200180465874367, + "learning_rate": 0.0006114932193646184, + "loss": 2.3996, + "step": 3886 + }, + { + "epoch": 0.44570576768719183, + "grad_norm": 0.24758407089851703, + "learning_rate": 0.0006113121919248957, + "loss": 2.509, + "step": 3887 + }, + { + "epoch": 0.4458204334365325, + "grad_norm": 0.24533149947496877, + "learning_rate": 0.0006111311491317778, + "loss": 2.4881, + "step": 3888 + }, + { + "epoch": 0.4459350991858732, + "grad_norm": 0.2395786161317771, + "learning_rate": 0.0006109500910102362, + "loss": 2.3821, + "step": 3889 + }, + { + "epoch": 0.44604976493521387, + "grad_norm": 0.22882911491575342, + "learning_rate": 0.0006107690175852445, + "loss": 2.5027, + "step": 3890 + }, + { + "epoch": 0.4461644306845545, + "grad_norm": 0.2246763906574688, + "learning_rate": 0.0006105879288817784, + "loss": 2.4849, + "step": 3891 + }, + { + "epoch": 0.4462790964338952, + "grad_norm": 0.26695889144441043, + "learning_rate": 0.0006104068249248154, + "loss": 2.6179, + "step": 3892 + }, + { + "epoch": 0.44639376218323584, + "grad_norm": 0.2549769365322054, + "learning_rate": 0.0006102257057393354, + "loss": 2.5787, + "step": 3893 + }, + { + "epoch": 0.44650842793257656, + "grad_norm": 0.23786294401581556, + "learning_rate": 0.0006100445713503206, + "loss": 2.4304, + "step": 3894 + }, + { + "epoch": 0.4466230936819172, + "grad_norm": 0.2319978596995435, + "learning_rate": 0.0006098634217827545, + "loss": 2.5445, + "step": 3895 + }, + { + "epoch": 0.4467377594312579, + "grad_norm": 0.2788302410289403, + "learning_rate": 0.0006096822570616237, + "loss": 2.5701, + "step": 3896 + }, + { + "epoch": 0.44685242518059853, + "grad_norm": 0.25301233180829713, + "learning_rate": 0.0006095010772119164, + "loss": 2.3952, + "step": 3897 + }, + { + "epoch": 0.44696709092993925, + "grad_norm": 0.26091805107399546, + "learning_rate": 0.0006093198822586226, + "loss": 2.5388, + "step": 3898 + }, + { + "epoch": 0.4470817566792799, + "grad_norm": 0.2466995792807347, + "learning_rate": 0.000609138672226735, + "loss": 2.4445, + "step": 3899 + }, + { + "epoch": 0.44719642242862057, + "grad_norm": 0.23208189183317335, + "learning_rate": 0.0006089574471412478, + "loss": 2.4613, + "step": 3900 + }, + { + "epoch": 0.4473110881779612, + "grad_norm": 0.2790907575989822, + "learning_rate": 0.0006087762070271578, + "loss": 2.4901, + "step": 3901 + }, + { + "epoch": 0.44742575392730194, + "grad_norm": 0.23807506556601798, + "learning_rate": 0.0006085949519094633, + "loss": 2.4675, + "step": 3902 + }, + { + "epoch": 0.4475404196766426, + "grad_norm": 0.2563582711324039, + "learning_rate": 0.0006084136818131654, + "loss": 2.4278, + "step": 3903 + }, + { + "epoch": 0.44765508542598326, + "grad_norm": 0.26126836016239624, + "learning_rate": 0.0006082323967632667, + "loss": 2.5002, + "step": 3904 + }, + { + "epoch": 0.4477697511753239, + "grad_norm": 0.271403256976357, + "learning_rate": 0.0006080510967847717, + "loss": 2.4677, + "step": 3905 + }, + { + "epoch": 0.44788441692466463, + "grad_norm": 0.2681036625460401, + "learning_rate": 0.0006078697819026878, + "loss": 2.5847, + "step": 3906 + }, + { + "epoch": 0.4479990826740053, + "grad_norm": 0.2693740921353591, + "learning_rate": 0.0006076884521420236, + "loss": 2.5738, + "step": 3907 + }, + { + "epoch": 0.44811374842334595, + "grad_norm": 0.26215981276277794, + "learning_rate": 0.00060750710752779, + "loss": 2.4214, + "step": 3908 + }, + { + "epoch": 0.4482284141726866, + "grad_norm": 0.2860964131000218, + "learning_rate": 0.0006073257480850004, + "loss": 2.4278, + "step": 3909 + }, + { + "epoch": 0.44834307992202727, + "grad_norm": 0.24765137731736517, + "learning_rate": 0.0006071443738386697, + "loss": 2.5565, + "step": 3910 + }, + { + "epoch": 0.448457745671368, + "grad_norm": 0.2585403997360567, + "learning_rate": 0.0006069629848138148, + "loss": 2.6081, + "step": 3911 + }, + { + "epoch": 0.44857241142070864, + "grad_norm": 0.257597783934508, + "learning_rate": 0.0006067815810354551, + "loss": 2.5092, + "step": 3912 + }, + { + "epoch": 0.4486870771700493, + "grad_norm": 0.2372115874551654, + "learning_rate": 0.0006066001625286118, + "loss": 2.3958, + "step": 3913 + }, + { + "epoch": 0.44880174291938996, + "grad_norm": 0.24870830577203515, + "learning_rate": 0.0006064187293183078, + "loss": 2.4428, + "step": 3914 + }, + { + "epoch": 0.44891640866873067, + "grad_norm": 0.28603906753009717, + "learning_rate": 0.0006062372814295689, + "loss": 2.5037, + "step": 3915 + }, + { + "epoch": 0.44903107441807133, + "grad_norm": 0.22456715997685556, + "learning_rate": 0.0006060558188874222, + "loss": 2.4613, + "step": 3916 + }, + { + "epoch": 0.449145740167412, + "grad_norm": 0.2386837408228895, + "learning_rate": 0.0006058743417168966, + "loss": 2.4513, + "step": 3917 + }, + { + "epoch": 0.44926040591675265, + "grad_norm": 0.24303256281060853, + "learning_rate": 0.0006056928499430237, + "loss": 2.4909, + "step": 3918 + }, + { + "epoch": 0.44937507166609336, + "grad_norm": 0.21746086090275502, + "learning_rate": 0.0006055113435908372, + "loss": 2.4478, + "step": 3919 + }, + { + "epoch": 0.449489737415434, + "grad_norm": 0.23063285317287116, + "learning_rate": 0.0006053298226853717, + "loss": 2.4453, + "step": 3920 + }, + { + "epoch": 0.4496044031647747, + "grad_norm": 0.25474637989298565, + "learning_rate": 0.0006051482872516652, + "loss": 2.4666, + "step": 3921 + }, + { + "epoch": 0.44971906891411534, + "grad_norm": 0.25808627323979777, + "learning_rate": 0.0006049667373147566, + "loss": 2.5451, + "step": 3922 + }, + { + "epoch": 0.44983373466345605, + "grad_norm": 0.2449051572113027, + "learning_rate": 0.0006047851728996875, + "loss": 2.5724, + "step": 3923 + }, + { + "epoch": 0.4499484004127967, + "grad_norm": 0.27452360169006523, + "learning_rate": 0.0006046035940315011, + "loss": 2.4163, + "step": 3924 + }, + { + "epoch": 0.45006306616213737, + "grad_norm": 0.25584978753401827, + "learning_rate": 0.0006044220007352429, + "loss": 2.5012, + "step": 3925 + }, + { + "epoch": 0.45017773191147803, + "grad_norm": 0.2527483904493502, + "learning_rate": 0.0006042403930359603, + "loss": 2.575, + "step": 3926 + }, + { + "epoch": 0.4502923976608187, + "grad_norm": 0.2645945218118985, + "learning_rate": 0.0006040587709587024, + "loss": 2.4448, + "step": 3927 + }, + { + "epoch": 0.4504070634101594, + "grad_norm": 0.2504694488306309, + "learning_rate": 0.0006038771345285204, + "loss": 2.4461, + "step": 3928 + }, + { + "epoch": 0.45052172915950006, + "grad_norm": 0.2621190909534613, + "learning_rate": 0.000603695483770468, + "loss": 2.635, + "step": 3929 + }, + { + "epoch": 0.4506363949088407, + "grad_norm": 0.23075822405655771, + "learning_rate": 0.0006035138187096, + "loss": 2.5651, + "step": 3930 + }, + { + "epoch": 0.4507510606581814, + "grad_norm": 0.2601474395698221, + "learning_rate": 0.0006033321393709741, + "loss": 2.4761, + "step": 3931 + }, + { + "epoch": 0.4508657264075221, + "grad_norm": 0.2350743451551013, + "learning_rate": 0.0006031504457796493, + "loss": 2.4331, + "step": 3932 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 0.2397324484573778, + "learning_rate": 0.0006029687379606864, + "loss": 2.4914, + "step": 3933 + }, + { + "epoch": 0.4510950579062034, + "grad_norm": 0.23647721962641163, + "learning_rate": 0.0006027870159391491, + "loss": 2.4892, + "step": 3934 + }, + { + "epoch": 0.45120972365554407, + "grad_norm": 0.2536743555375813, + "learning_rate": 0.0006026052797401022, + "loss": 2.5464, + "step": 3935 + }, + { + "epoch": 0.4513243894048848, + "grad_norm": 0.2292260255227131, + "learning_rate": 0.0006024235293886127, + "loss": 2.5472, + "step": 3936 + }, + { + "epoch": 0.45143905515422544, + "grad_norm": 0.24976893414891224, + "learning_rate": 0.0006022417649097499, + "loss": 2.4496, + "step": 3937 + }, + { + "epoch": 0.4515537209035661, + "grad_norm": 0.21049273864184484, + "learning_rate": 0.0006020599863285845, + "loss": 2.4127, + "step": 3938 + }, + { + "epoch": 0.45166838665290676, + "grad_norm": 0.24644349924514333, + "learning_rate": 0.0006018781936701893, + "loss": 2.5237, + "step": 3939 + }, + { + "epoch": 0.4517830524022475, + "grad_norm": 0.22943860141941533, + "learning_rate": 0.0006016963869596392, + "loss": 2.359, + "step": 3940 + }, + { + "epoch": 0.45189771815158813, + "grad_norm": 0.22909120652737902, + "learning_rate": 0.0006015145662220113, + "loss": 2.3493, + "step": 3941 + }, + { + "epoch": 0.4520123839009288, + "grad_norm": 0.2775648807420886, + "learning_rate": 0.0006013327314823838, + "loss": 2.4402, + "step": 3942 + }, + { + "epoch": 0.45212704965026945, + "grad_norm": 0.23361955558017575, + "learning_rate": 0.0006011508827658376, + "loss": 2.4639, + "step": 3943 + }, + { + "epoch": 0.4522417153996101, + "grad_norm": 0.2622691237723836, + "learning_rate": 0.0006009690200974553, + "loss": 2.6042, + "step": 3944 + }, + { + "epoch": 0.4523563811489508, + "grad_norm": 0.24593252243237382, + "learning_rate": 0.0006007871435023213, + "loss": 2.3898, + "step": 3945 + }, + { + "epoch": 0.4524710468982915, + "grad_norm": 0.2364488838696429, + "learning_rate": 0.0006006052530055221, + "loss": 2.567, + "step": 3946 + }, + { + "epoch": 0.45258571264763214, + "grad_norm": 0.2698985054027099, + "learning_rate": 0.0006004233486321459, + "loss": 2.5669, + "step": 3947 + }, + { + "epoch": 0.4527003783969728, + "grad_norm": 0.2660980300059388, + "learning_rate": 0.000600241430407283, + "loss": 2.4717, + "step": 3948 + }, + { + "epoch": 0.4528150441463135, + "grad_norm": 0.24842296062626618, + "learning_rate": 0.0006000594983560255, + "loss": 2.5179, + "step": 3949 + }, + { + "epoch": 0.4529297098956542, + "grad_norm": 0.263252003583487, + "learning_rate": 0.0005998775525034675, + "loss": 2.4295, + "step": 3950 + }, + { + "epoch": 0.45304437564499483, + "grad_norm": 0.22590002470299006, + "learning_rate": 0.0005996955928747051, + "loss": 2.433, + "step": 3951 + }, + { + "epoch": 0.4531590413943355, + "grad_norm": 0.23613619835138924, + "learning_rate": 0.0005995136194948359, + "loss": 2.4345, + "step": 3952 + }, + { + "epoch": 0.4532737071436762, + "grad_norm": 0.24913655879233865, + "learning_rate": 0.0005993316323889598, + "loss": 2.4798, + "step": 3953 + }, + { + "epoch": 0.45338837289301687, + "grad_norm": 0.2250205008799578, + "learning_rate": 0.0005991496315821786, + "loss": 2.4742, + "step": 3954 + }, + { + "epoch": 0.4535030386423575, + "grad_norm": 0.2364726356552407, + "learning_rate": 0.0005989676170995954, + "loss": 2.5198, + "step": 3955 + }, + { + "epoch": 0.4536177043916982, + "grad_norm": 0.2324341528537448, + "learning_rate": 0.0005987855889663163, + "loss": 2.5988, + "step": 3956 + }, + { + "epoch": 0.4537323701410389, + "grad_norm": 0.24049310020696812, + "learning_rate": 0.000598603547207448, + "loss": 2.4445, + "step": 3957 + }, + { + "epoch": 0.45384703589037956, + "grad_norm": 0.23565677178868943, + "learning_rate": 0.0005984214918480999, + "loss": 2.4665, + "step": 3958 + }, + { + "epoch": 0.4539617016397202, + "grad_norm": 0.25642489670638774, + "learning_rate": 0.0005982394229133832, + "loss": 2.6918, + "step": 3959 + }, + { + "epoch": 0.4540763673890609, + "grad_norm": 0.2410093420291846, + "learning_rate": 0.0005980573404284107, + "loss": 2.4633, + "step": 3960 + }, + { + "epoch": 0.45419103313840153, + "grad_norm": 0.23350412053138167, + "learning_rate": 0.000597875244418297, + "loss": 2.5599, + "step": 3961 + }, + { + "epoch": 0.45430569888774225, + "grad_norm": 0.24087131854138213, + "learning_rate": 0.0005976931349081593, + "loss": 2.3308, + "step": 3962 + }, + { + "epoch": 0.4544203646370829, + "grad_norm": 0.23713647194721107, + "learning_rate": 0.0005975110119231157, + "loss": 2.564, + "step": 3963 + }, + { + "epoch": 0.45453503038642357, + "grad_norm": 0.22472626844348545, + "learning_rate": 0.0005973288754882867, + "loss": 2.561, + "step": 3964 + }, + { + "epoch": 0.4546496961357642, + "grad_norm": 0.27797479606783915, + "learning_rate": 0.0005971467256287947, + "loss": 2.4504, + "step": 3965 + }, + { + "epoch": 0.45476436188510494, + "grad_norm": 0.24716802967188817, + "learning_rate": 0.0005969645623697636, + "loss": 2.5725, + "step": 3966 + }, + { + "epoch": 0.4548790276344456, + "grad_norm": 0.25772233146269496, + "learning_rate": 0.0005967823857363195, + "loss": 2.5397, + "step": 3967 + }, + { + "epoch": 0.45499369338378626, + "grad_norm": 0.23219086909346606, + "learning_rate": 0.0005966001957535901, + "loss": 2.5535, + "step": 3968 + }, + { + "epoch": 0.4551083591331269, + "grad_norm": 0.22606169938490384, + "learning_rate": 0.000596417992446705, + "loss": 2.4102, + "step": 3969 + }, + { + "epoch": 0.45522302488246763, + "grad_norm": 0.2613332214463567, + "learning_rate": 0.0005962357758407958, + "loss": 2.4485, + "step": 3970 + }, + { + "epoch": 0.4553376906318083, + "grad_norm": 0.25739259406632836, + "learning_rate": 0.0005960535459609957, + "loss": 2.4801, + "step": 3971 + }, + { + "epoch": 0.45545235638114895, + "grad_norm": 0.253512675124322, + "learning_rate": 0.00059587130283244, + "loss": 2.4935, + "step": 3972 + }, + { + "epoch": 0.4555670221304896, + "grad_norm": 0.2601760453329737, + "learning_rate": 0.0005956890464802654, + "loss": 2.4514, + "step": 3973 + }, + { + "epoch": 0.4556816878798303, + "grad_norm": 0.26276500057400415, + "learning_rate": 0.0005955067769296109, + "loss": 2.3884, + "step": 3974 + }, + { + "epoch": 0.455796353629171, + "grad_norm": 0.21460375373954713, + "learning_rate": 0.0005953244942056171, + "loss": 2.4364, + "step": 3975 + }, + { + "epoch": 0.45591101937851164, + "grad_norm": 0.2862866908494329, + "learning_rate": 0.0005951421983334263, + "loss": 2.5374, + "step": 3976 + }, + { + "epoch": 0.4560256851278523, + "grad_norm": 0.239740462331181, + "learning_rate": 0.0005949598893381828, + "loss": 2.5119, + "step": 3977 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 0.23932017674871514, + "learning_rate": 0.0005947775672450326, + "loss": 2.4051, + "step": 3978 + }, + { + "epoch": 0.45625501662653367, + "grad_norm": 0.25772753388996095, + "learning_rate": 0.0005945952320791239, + "loss": 2.5401, + "step": 3979 + }, + { + "epoch": 0.45636968237587433, + "grad_norm": 0.235613442920123, + "learning_rate": 0.0005944128838656059, + "loss": 2.4094, + "step": 3980 + }, + { + "epoch": 0.456484348125215, + "grad_norm": 0.24132322076811574, + "learning_rate": 0.0005942305226296302, + "loss": 2.5086, + "step": 3981 + }, + { + "epoch": 0.45659901387455565, + "grad_norm": 0.24971451976323805, + "learning_rate": 0.0005940481483963502, + "loss": 2.538, + "step": 3982 + }, + { + "epoch": 0.45671367962389636, + "grad_norm": 0.24986706023905864, + "learning_rate": 0.0005938657611909206, + "loss": 2.4424, + "step": 3983 + }, + { + "epoch": 0.456828345373237, + "grad_norm": 0.2501549109312565, + "learning_rate": 0.0005936833610384988, + "loss": 2.3045, + "step": 3984 + }, + { + "epoch": 0.4569430111225777, + "grad_norm": 0.2392833009886116, + "learning_rate": 0.0005935009479642431, + "loss": 2.4197, + "step": 3985 + }, + { + "epoch": 0.45705767687191834, + "grad_norm": 0.23392399533363353, + "learning_rate": 0.0005933185219933137, + "loss": 2.5313, + "step": 3986 + }, + { + "epoch": 0.45717234262125905, + "grad_norm": 0.24513047609038893, + "learning_rate": 0.0005931360831508732, + "loss": 2.5182, + "step": 3987 + }, + { + "epoch": 0.4572870083705997, + "grad_norm": 0.23490837286542676, + "learning_rate": 0.0005929536314620852, + "loss": 2.4514, + "step": 3988 + }, + { + "epoch": 0.45740167411994037, + "grad_norm": 0.2352393406286705, + "learning_rate": 0.0005927711669521156, + "loss": 2.4493, + "step": 3989 + }, + { + "epoch": 0.45751633986928103, + "grad_norm": 0.24756796586114393, + "learning_rate": 0.0005925886896461321, + "loss": 2.4291, + "step": 3990 + }, + { + "epoch": 0.45763100561862174, + "grad_norm": 0.29996769307345406, + "learning_rate": 0.0005924061995693036, + "loss": 2.3033, + "step": 3991 + }, + { + "epoch": 0.4577456713679624, + "grad_norm": 0.24497351739374512, + "learning_rate": 0.0005922236967468013, + "loss": 2.3321, + "step": 3992 + }, + { + "epoch": 0.45786033711730306, + "grad_norm": 0.23903044983833396, + "learning_rate": 0.000592041181203798, + "loss": 2.4554, + "step": 3993 + }, + { + "epoch": 0.4579750028666437, + "grad_norm": 0.25883812390045613, + "learning_rate": 0.0005918586529654684, + "loss": 2.5333, + "step": 3994 + }, + { + "epoch": 0.4580896686159844, + "grad_norm": 0.2836977633084591, + "learning_rate": 0.0005916761120569883, + "loss": 2.5951, + "step": 3995 + }, + { + "epoch": 0.4582043343653251, + "grad_norm": 0.2611893556043827, + "learning_rate": 0.0005914935585035363, + "loss": 2.5358, + "step": 3996 + }, + { + "epoch": 0.45831900011466575, + "grad_norm": 0.24492121390836463, + "learning_rate": 0.0005913109923302919, + "loss": 2.5196, + "step": 3997 + }, + { + "epoch": 0.4584336658640064, + "grad_norm": 0.259712987879203, + "learning_rate": 0.0005911284135624365, + "loss": 2.5814, + "step": 3998 + }, + { + "epoch": 0.45854833161334707, + "grad_norm": 0.24640690971169735, + "learning_rate": 0.0005909458222251536, + "loss": 2.3768, + "step": 3999 + }, + { + "epoch": 0.4586629973626878, + "grad_norm": 0.2710985881162735, + "learning_rate": 0.0005907632183436281, + "loss": 2.2765, + "step": 4000 + }, + { + "epoch": 0.45877766311202844, + "grad_norm": 0.24117229763682085, + "learning_rate": 0.0005905806019430468, + "loss": 2.4958, + "step": 4001 + }, + { + "epoch": 0.4588923288613691, + "grad_norm": 0.24304433426590974, + "learning_rate": 0.0005903979730485979, + "loss": 2.508, + "step": 4002 + }, + { + "epoch": 0.45900699461070976, + "grad_norm": 0.22195366824526383, + "learning_rate": 0.0005902153316854718, + "loss": 2.5461, + "step": 4003 + }, + { + "epoch": 0.4591216603600505, + "grad_norm": 0.3022572093007704, + "learning_rate": 0.0005900326778788605, + "loss": 2.4491, + "step": 4004 + }, + { + "epoch": 0.45923632610939114, + "grad_norm": 0.23712500714320361, + "learning_rate": 0.0005898500116539575, + "loss": 2.4224, + "step": 4005 + }, + { + "epoch": 0.4593509918587318, + "grad_norm": 0.2276446315279386, + "learning_rate": 0.0005896673330359579, + "loss": 2.4692, + "step": 4006 + }, + { + "epoch": 0.45946565760807245, + "grad_norm": 0.24160217840238002, + "learning_rate": 0.000589484642050059, + "loss": 2.5746, + "step": 4007 + }, + { + "epoch": 0.45958032335741317, + "grad_norm": 0.21556816317647048, + "learning_rate": 0.0005893019387214595, + "loss": 2.3802, + "step": 4008 + }, + { + "epoch": 0.4596949891067538, + "grad_norm": 0.2585601183500471, + "learning_rate": 0.0005891192230753597, + "loss": 2.677, + "step": 4009 + }, + { + "epoch": 0.4598096548560945, + "grad_norm": 0.25481247858543454, + "learning_rate": 0.000588936495136962, + "loss": 2.4946, + "step": 4010 + }, + { + "epoch": 0.45992432060543514, + "grad_norm": 0.25821269187748086, + "learning_rate": 0.0005887537549314699, + "loss": 2.4564, + "step": 4011 + }, + { + "epoch": 0.4600389863547758, + "grad_norm": 0.2783909170668665, + "learning_rate": 0.0005885710024840893, + "loss": 2.4776, + "step": 4012 + }, + { + "epoch": 0.4601536521041165, + "grad_norm": 0.24701354913323206, + "learning_rate": 0.000588388237820027, + "loss": 2.4524, + "step": 4013 + }, + { + "epoch": 0.4602683178534572, + "grad_norm": 0.2587736756076629, + "learning_rate": 0.0005882054609644923, + "loss": 2.4944, + "step": 4014 + }, + { + "epoch": 0.46038298360279784, + "grad_norm": 0.2777160543424753, + "learning_rate": 0.0005880226719426956, + "loss": 2.3666, + "step": 4015 + }, + { + "epoch": 0.4604976493521385, + "grad_norm": 0.27470829566841637, + "learning_rate": 0.0005878398707798491, + "loss": 2.4321, + "step": 4016 + }, + { + "epoch": 0.4606123151014792, + "grad_norm": 0.2515435455577454, + "learning_rate": 0.0005876570575011668, + "loss": 2.4091, + "step": 4017 + }, + { + "epoch": 0.46072698085081987, + "grad_norm": 0.2622692983818868, + "learning_rate": 0.0005874742321318643, + "loss": 2.5589, + "step": 4018 + }, + { + "epoch": 0.4608416466001605, + "grad_norm": 0.26305709035978064, + "learning_rate": 0.0005872913946971591, + "loss": 2.4523, + "step": 4019 + }, + { + "epoch": 0.4609563123495012, + "grad_norm": 0.24299542114107053, + "learning_rate": 0.0005871085452222697, + "loss": 2.5156, + "step": 4020 + }, + { + "epoch": 0.4610709780988419, + "grad_norm": 0.2525557451145121, + "learning_rate": 0.000586925683732417, + "loss": 2.4831, + "step": 4021 + }, + { + "epoch": 0.46118564384818256, + "grad_norm": 0.23241168067750473, + "learning_rate": 0.0005867428102528233, + "loss": 2.5007, + "step": 4022 + }, + { + "epoch": 0.4613003095975232, + "grad_norm": 0.21674378350997456, + "learning_rate": 0.0005865599248087122, + "loss": 2.445, + "step": 4023 + }, + { + "epoch": 0.4614149753468639, + "grad_norm": 0.2619977477590639, + "learning_rate": 0.0005863770274253095, + "loss": 2.5395, + "step": 4024 + }, + { + "epoch": 0.4615296410962046, + "grad_norm": 0.21657052303508098, + "learning_rate": 0.0005861941181278425, + "loss": 2.457, + "step": 4025 + }, + { + "epoch": 0.46164430684554525, + "grad_norm": 0.2336017436641503, + "learning_rate": 0.0005860111969415397, + "loss": 2.5186, + "step": 4026 + }, + { + "epoch": 0.4617589725948859, + "grad_norm": 0.2449664358939737, + "learning_rate": 0.0005858282638916319, + "loss": 2.4698, + "step": 4027 + }, + { + "epoch": 0.46187363834422657, + "grad_norm": 0.21640288697634297, + "learning_rate": 0.0005856453190033512, + "loss": 2.4507, + "step": 4028 + }, + { + "epoch": 0.4619883040935672, + "grad_norm": 0.22687931653126772, + "learning_rate": 0.0005854623623019313, + "loss": 2.5422, + "step": 4029 + }, + { + "epoch": 0.46210296984290794, + "grad_norm": 0.22559651454310134, + "learning_rate": 0.0005852793938126074, + "loss": 2.4375, + "step": 4030 + }, + { + "epoch": 0.4622176355922486, + "grad_norm": 0.24831776755149493, + "learning_rate": 0.0005850964135606169, + "loss": 2.4207, + "step": 4031 + }, + { + "epoch": 0.46233230134158926, + "grad_norm": 0.26743306223850294, + "learning_rate": 0.000584913421571198, + "loss": 2.3825, + "step": 4032 + }, + { + "epoch": 0.4624469670909299, + "grad_norm": 0.25457242928236234, + "learning_rate": 0.0005847304178695914, + "loss": 2.502, + "step": 4033 + }, + { + "epoch": 0.46256163284027063, + "grad_norm": 0.25912940667538276, + "learning_rate": 0.0005845474024810387, + "loss": 2.5471, + "step": 4034 + }, + { + "epoch": 0.4626762985896113, + "grad_norm": 0.24394943247383544, + "learning_rate": 0.0005843643754307834, + "loss": 2.3433, + "step": 4035 + }, + { + "epoch": 0.46279096433895195, + "grad_norm": 0.28551047448527844, + "learning_rate": 0.0005841813367440707, + "loss": 2.5346, + "step": 4036 + }, + { + "epoch": 0.4629056300882926, + "grad_norm": 0.30348234511021444, + "learning_rate": 0.0005839982864461473, + "loss": 2.5535, + "step": 4037 + }, + { + "epoch": 0.4630202958376333, + "grad_norm": 0.3006559693647635, + "learning_rate": 0.0005838152245622614, + "loss": 2.5015, + "step": 4038 + }, + { + "epoch": 0.463134961586974, + "grad_norm": 0.26931888177659113, + "learning_rate": 0.0005836321511176628, + "loss": 2.5331, + "step": 4039 + }, + { + "epoch": 0.46324962733631464, + "grad_norm": 0.26874848305500937, + "learning_rate": 0.0005834490661376033, + "loss": 2.5138, + "step": 4040 + }, + { + "epoch": 0.4633642930856553, + "grad_norm": 0.25467707896578495, + "learning_rate": 0.0005832659696473356, + "loss": 2.5055, + "step": 4041 + }, + { + "epoch": 0.463478958834996, + "grad_norm": 0.2512393029246634, + "learning_rate": 0.0005830828616721148, + "loss": 2.414, + "step": 4042 + }, + { + "epoch": 0.46359362458433667, + "grad_norm": 0.2707415766615185, + "learning_rate": 0.0005828997422371967, + "loss": 2.4702, + "step": 4043 + }, + { + "epoch": 0.46370829033367733, + "grad_norm": 0.24811277836871373, + "learning_rate": 0.0005827166113678398, + "loss": 2.5298, + "step": 4044 + }, + { + "epoch": 0.463822956083018, + "grad_norm": 0.22288366279505703, + "learning_rate": 0.0005825334690893028, + "loss": 2.5795, + "step": 4045 + }, + { + "epoch": 0.46393762183235865, + "grad_norm": 0.23955363484670503, + "learning_rate": 0.0005823503154268468, + "loss": 2.6075, + "step": 4046 + }, + { + "epoch": 0.46405228758169936, + "grad_norm": 0.25359801528817916, + "learning_rate": 0.0005821671504057348, + "loss": 2.5652, + "step": 4047 + }, + { + "epoch": 0.46416695333104, + "grad_norm": 0.23369015862046746, + "learning_rate": 0.0005819839740512305, + "loss": 2.5, + "step": 4048 + }, + { + "epoch": 0.4642816190803807, + "grad_norm": 0.24502240622352708, + "learning_rate": 0.0005818007863885998, + "loss": 2.5694, + "step": 4049 + }, + { + "epoch": 0.46439628482972134, + "grad_norm": 0.24052848907085234, + "learning_rate": 0.00058161758744311, + "loss": 2.3116, + "step": 4050 + }, + { + "epoch": 0.46451095057906205, + "grad_norm": 0.25976262827275237, + "learning_rate": 0.0005814343772400296, + "loss": 2.59, + "step": 4051 + }, + { + "epoch": 0.4646256163284027, + "grad_norm": 0.23473119492495118, + "learning_rate": 0.0005812511558046291, + "loss": 2.6317, + "step": 4052 + }, + { + "epoch": 0.46474028207774337, + "grad_norm": 0.24156652517910596, + "learning_rate": 0.0005810679231621806, + "loss": 2.3444, + "step": 4053 + }, + { + "epoch": 0.46485494782708403, + "grad_norm": 0.25752262646360335, + "learning_rate": 0.0005808846793379575, + "loss": 2.4572, + "step": 4054 + }, + { + "epoch": 0.46496961357642475, + "grad_norm": 0.22791304663471093, + "learning_rate": 0.0005807014243572343, + "loss": 2.486, + "step": 4055 + }, + { + "epoch": 0.4650842793257654, + "grad_norm": 0.2415570771674224, + "learning_rate": 0.0005805181582452882, + "loss": 2.4783, + "step": 4056 + }, + { + "epoch": 0.46519894507510606, + "grad_norm": 0.27076805445126584, + "learning_rate": 0.0005803348810273969, + "loss": 2.3277, + "step": 4057 + }, + { + "epoch": 0.4653136108244467, + "grad_norm": 0.24976209707713065, + "learning_rate": 0.0005801515927288401, + "loss": 2.4968, + "step": 4058 + }, + { + "epoch": 0.46542827657378744, + "grad_norm": 0.27699544269951104, + "learning_rate": 0.000579968293374899, + "loss": 2.5445, + "step": 4059 + }, + { + "epoch": 0.4655429423231281, + "grad_norm": 0.25349821394296557, + "learning_rate": 0.0005797849829908561, + "loss": 2.5464, + "step": 4060 + }, + { + "epoch": 0.46565760807246875, + "grad_norm": 0.2561071030111794, + "learning_rate": 0.0005796016616019955, + "loss": 2.404, + "step": 4061 + }, + { + "epoch": 0.4657722738218094, + "grad_norm": 0.26941566199940703, + "learning_rate": 0.0005794183292336032, + "loss": 2.5623, + "step": 4062 + }, + { + "epoch": 0.46588693957115007, + "grad_norm": 0.2656433122784312, + "learning_rate": 0.0005792349859109662, + "loss": 2.4062, + "step": 4063 + }, + { + "epoch": 0.4660016053204908, + "grad_norm": 0.23417855297483753, + "learning_rate": 0.0005790516316593733, + "loss": 2.5016, + "step": 4064 + }, + { + "epoch": 0.46611627106983144, + "grad_norm": 0.27376676814228473, + "learning_rate": 0.0005788682665041146, + "loss": 2.5389, + "step": 4065 + }, + { + "epoch": 0.4662309368191721, + "grad_norm": 0.27298158316833904, + "learning_rate": 0.0005786848904704818, + "loss": 2.4707, + "step": 4066 + }, + { + "epoch": 0.46634560256851276, + "grad_norm": 0.24304253791162964, + "learning_rate": 0.0005785015035837684, + "loss": 2.3979, + "step": 4067 + }, + { + "epoch": 0.4664602683178535, + "grad_norm": 0.2278037832499448, + "learning_rate": 0.000578318105869269, + "loss": 2.4998, + "step": 4068 + }, + { + "epoch": 0.46657493406719414, + "grad_norm": 0.22367781495391703, + "learning_rate": 0.0005781346973522797, + "loss": 2.2515, + "step": 4069 + }, + { + "epoch": 0.4666895998165348, + "grad_norm": 0.24845967795308857, + "learning_rate": 0.0005779512780580981, + "loss": 2.4585, + "step": 4070 + }, + { + "epoch": 0.46680426556587545, + "grad_norm": 0.22475780584585855, + "learning_rate": 0.0005777678480120237, + "loss": 2.5699, + "step": 4071 + }, + { + "epoch": 0.46691893131521617, + "grad_norm": 0.23761655436314588, + "learning_rate": 0.0005775844072393573, + "loss": 2.564, + "step": 4072 + }, + { + "epoch": 0.4670335970645568, + "grad_norm": 0.2569776784275677, + "learning_rate": 0.0005774009557654006, + "loss": 2.5437, + "step": 4073 + }, + { + "epoch": 0.4671482628138975, + "grad_norm": 0.22783050644097697, + "learning_rate": 0.0005772174936154573, + "loss": 2.5144, + "step": 4074 + }, + { + "epoch": 0.46726292856323814, + "grad_norm": 0.24813561223739344, + "learning_rate": 0.000577034020814833, + "loss": 2.5319, + "step": 4075 + }, + { + "epoch": 0.46737759431257886, + "grad_norm": 0.20983291977362017, + "learning_rate": 0.0005768505373888337, + "loss": 2.4116, + "step": 4076 + }, + { + "epoch": 0.4674922600619195, + "grad_norm": 0.25789828576346707, + "learning_rate": 0.0005766670433627677, + "loss": 2.5806, + "step": 4077 + }, + { + "epoch": 0.4676069258112602, + "grad_norm": 0.2464997740378045, + "learning_rate": 0.0005764835387619444, + "loss": 2.4314, + "step": 4078 + }, + { + "epoch": 0.46772159156060084, + "grad_norm": 0.24757597445739538, + "learning_rate": 0.000576300023611675, + "loss": 2.5836, + "step": 4079 + }, + { + "epoch": 0.4678362573099415, + "grad_norm": 0.23818417194069333, + "learning_rate": 0.0005761164979372716, + "loss": 2.3464, + "step": 4080 + }, + { + "epoch": 0.4679509230592822, + "grad_norm": 0.24755121328919102, + "learning_rate": 0.0005759329617640483, + "loss": 2.4426, + "step": 4081 + }, + { + "epoch": 0.46806558880862287, + "grad_norm": 0.23916563763164692, + "learning_rate": 0.0005757494151173204, + "loss": 2.4868, + "step": 4082 + }, + { + "epoch": 0.4681802545579635, + "grad_norm": 0.2612887135226204, + "learning_rate": 0.0005755658580224043, + "loss": 2.5336, + "step": 4083 + }, + { + "epoch": 0.4682949203073042, + "grad_norm": 0.25747175587157045, + "learning_rate": 0.0005753822905046189, + "loss": 2.5981, + "step": 4084 + }, + { + "epoch": 0.4684095860566449, + "grad_norm": 0.2423193432486496, + "learning_rate": 0.0005751987125892833, + "loss": 2.3855, + "step": 4085 + }, + { + "epoch": 0.46852425180598556, + "grad_norm": 0.25026805400259666, + "learning_rate": 0.0005750151243017187, + "loss": 2.6247, + "step": 4086 + }, + { + "epoch": 0.4686389175553262, + "grad_norm": 0.23501021503952846, + "learning_rate": 0.0005748315256672476, + "loss": 2.3819, + "step": 4087 + }, + { + "epoch": 0.4687535833046669, + "grad_norm": 0.2800629810545975, + "learning_rate": 0.000574647916711194, + "loss": 2.6783, + "step": 4088 + }, + { + "epoch": 0.4688682490540076, + "grad_norm": 0.25150584813890065, + "learning_rate": 0.000574464297458883, + "loss": 2.5303, + "step": 4089 + }, + { + "epoch": 0.46898291480334825, + "grad_norm": 0.3058588158795674, + "learning_rate": 0.000574280667935642, + "loss": 2.3786, + "step": 4090 + }, + { + "epoch": 0.4690975805526889, + "grad_norm": 0.23932394408537352, + "learning_rate": 0.0005740970281667984, + "loss": 2.41, + "step": 4091 + }, + { + "epoch": 0.46921224630202957, + "grad_norm": 0.2566441254792209, + "learning_rate": 0.0005739133781776824, + "loss": 2.551, + "step": 4092 + }, + { + "epoch": 0.4693269120513703, + "grad_norm": 0.25718589538971726, + "learning_rate": 0.0005737297179936247, + "loss": 2.4187, + "step": 4093 + }, + { + "epoch": 0.46944157780071094, + "grad_norm": 0.23482095526672317, + "learning_rate": 0.0005735460476399579, + "loss": 2.4443, + "step": 4094 + }, + { + "epoch": 0.4695562435500516, + "grad_norm": 0.25081607485848917, + "learning_rate": 0.0005733623671420156, + "loss": 2.4984, + "step": 4095 + }, + { + "epoch": 0.46967090929939226, + "grad_norm": 0.23490462008074964, + "learning_rate": 0.0005731786765251333, + "loss": 2.3803, + "step": 4096 + }, + { + "epoch": 0.4697855750487329, + "grad_norm": 0.24706699653589545, + "learning_rate": 0.0005729949758146475, + "loss": 2.6328, + "step": 4097 + }, + { + "epoch": 0.46990024079807363, + "grad_norm": 0.23957953264753773, + "learning_rate": 0.0005728112650358961, + "loss": 2.509, + "step": 4098 + }, + { + "epoch": 0.4700149065474143, + "grad_norm": 0.23415303858141057, + "learning_rate": 0.0005726275442142185, + "loss": 2.4905, + "step": 4099 + }, + { + "epoch": 0.47012957229675495, + "grad_norm": 0.2348783567025958, + "learning_rate": 0.0005724438133749559, + "loss": 2.4462, + "step": 4100 + }, + { + "epoch": 0.4702442380460956, + "grad_norm": 0.24221795688984288, + "learning_rate": 0.0005722600725434499, + "loss": 2.5664, + "step": 4101 + }, + { + "epoch": 0.4703589037954363, + "grad_norm": 0.24700780962568017, + "learning_rate": 0.0005720763217450443, + "loss": 2.4485, + "step": 4102 + }, + { + "epoch": 0.470473569544777, + "grad_norm": 0.2395912443385863, + "learning_rate": 0.000571892561005084, + "loss": 2.6206, + "step": 4103 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.25004202397048964, + "learning_rate": 0.0005717087903489155, + "loss": 2.4654, + "step": 4104 + }, + { + "epoch": 0.4707029010434583, + "grad_norm": 0.23686223407279985, + "learning_rate": 0.000571525009801886, + "loss": 2.3086, + "step": 4105 + }, + { + "epoch": 0.470817566792799, + "grad_norm": 0.27117640531275145, + "learning_rate": 0.0005713412193893451, + "loss": 2.6296, + "step": 4106 + }, + { + "epoch": 0.4709322325421397, + "grad_norm": 0.2292084382065695, + "learning_rate": 0.0005711574191366427, + "loss": 2.4905, + "step": 4107 + }, + { + "epoch": 0.47104689829148033, + "grad_norm": 0.2440423126972062, + "learning_rate": 0.0005709736090691305, + "loss": 2.4408, + "step": 4108 + }, + { + "epoch": 0.471161564040821, + "grad_norm": 0.2319575387284643, + "learning_rate": 0.0005707897892121621, + "loss": 2.5366, + "step": 4109 + }, + { + "epoch": 0.4712762297901617, + "grad_norm": 0.25557685494323185, + "learning_rate": 0.0005706059595910918, + "loss": 2.5395, + "step": 4110 + }, + { + "epoch": 0.47139089553950236, + "grad_norm": 0.2418003712874159, + "learning_rate": 0.0005704221202312748, + "loss": 2.372, + "step": 4111 + }, + { + "epoch": 0.471505561288843, + "grad_norm": 0.246028321079148, + "learning_rate": 0.000570238271158069, + "loss": 2.5977, + "step": 4112 + }, + { + "epoch": 0.4716202270381837, + "grad_norm": 0.2405902488996967, + "learning_rate": 0.0005700544123968325, + "loss": 2.4233, + "step": 4113 + }, + { + "epoch": 0.47173489278752434, + "grad_norm": 0.3004289086723318, + "learning_rate": 0.0005698705439729251, + "loss": 2.3543, + "step": 4114 + }, + { + "epoch": 0.47184955853686505, + "grad_norm": 0.23741141665197765, + "learning_rate": 0.0005696866659117081, + "loss": 2.46, + "step": 4115 + }, + { + "epoch": 0.4719642242862057, + "grad_norm": 0.21233013484141894, + "learning_rate": 0.0005695027782385438, + "loss": 2.3237, + "step": 4116 + }, + { + "epoch": 0.4720788900355464, + "grad_norm": 0.24478340844456165, + "learning_rate": 0.0005693188809787961, + "loss": 2.5672, + "step": 4117 + }, + { + "epoch": 0.47219355578488703, + "grad_norm": 0.2300892030179047, + "learning_rate": 0.00056913497415783, + "loss": 2.414, + "step": 4118 + }, + { + "epoch": 0.47230822153422775, + "grad_norm": 0.2485024049875083, + "learning_rate": 0.0005689510578010123, + "loss": 2.532, + "step": 4119 + }, + { + "epoch": 0.4724228872835684, + "grad_norm": 0.22393404248639082, + "learning_rate": 0.0005687671319337103, + "loss": 2.5336, + "step": 4120 + }, + { + "epoch": 0.47253755303290906, + "grad_norm": 0.2433928478306489, + "learning_rate": 0.0005685831965812933, + "loss": 2.4394, + "step": 4121 + }, + { + "epoch": 0.4726522187822497, + "grad_norm": 0.2346018230669044, + "learning_rate": 0.0005683992517691318, + "loss": 2.4222, + "step": 4122 + }, + { + "epoch": 0.47276688453159044, + "grad_norm": 0.24427067208209588, + "learning_rate": 0.0005682152975225972, + "loss": 2.37, + "step": 4123 + }, + { + "epoch": 0.4728815502809311, + "grad_norm": 0.22716393868440118, + "learning_rate": 0.0005680313338670627, + "loss": 2.5162, + "step": 4124 + }, + { + "epoch": 0.47299621603027175, + "grad_norm": 0.23690201275609046, + "learning_rate": 0.0005678473608279024, + "loss": 2.4383, + "step": 4125 + }, + { + "epoch": 0.4731108817796124, + "grad_norm": 0.2536670048753871, + "learning_rate": 0.000567663378430492, + "loss": 2.472, + "step": 4126 + }, + { + "epoch": 0.4732255475289531, + "grad_norm": 0.25796315385799085, + "learning_rate": 0.0005674793867002083, + "loss": 2.4826, + "step": 4127 + }, + { + "epoch": 0.4733402132782938, + "grad_norm": 0.25690528032225146, + "learning_rate": 0.0005672953856624294, + "loss": 2.4395, + "step": 4128 + }, + { + "epoch": 0.47345487902763445, + "grad_norm": 0.23938403695296154, + "learning_rate": 0.000567111375342535, + "loss": 2.5153, + "step": 4129 + }, + { + "epoch": 0.4735695447769751, + "grad_norm": 0.2642814905579894, + "learning_rate": 0.0005669273557659055, + "loss": 2.3523, + "step": 4130 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.2523240847978188, + "learning_rate": 0.000566743326957923, + "loss": 2.5267, + "step": 4131 + }, + { + "epoch": 0.4737988762756565, + "grad_norm": 0.2558946186781651, + "learning_rate": 0.0005665592889439709, + "loss": 2.3615, + "step": 4132 + }, + { + "epoch": 0.47391354202499714, + "grad_norm": 0.27073208018057404, + "learning_rate": 0.0005663752417494334, + "loss": 2.5466, + "step": 4133 + }, + { + "epoch": 0.4740282077743378, + "grad_norm": 0.2858518708997128, + "learning_rate": 0.0005661911853996969, + "loss": 2.5796, + "step": 4134 + }, + { + "epoch": 0.47414287352367845, + "grad_norm": 0.22950780578974697, + "learning_rate": 0.0005660071199201479, + "loss": 2.4335, + "step": 4135 + }, + { + "epoch": 0.47425753927301917, + "grad_norm": 0.22233527700134328, + "learning_rate": 0.0005658230453361748, + "loss": 2.4029, + "step": 4136 + }, + { + "epoch": 0.4743722050223598, + "grad_norm": 0.28034842991877434, + "learning_rate": 0.0005656389616731675, + "loss": 2.6054, + "step": 4137 + }, + { + "epoch": 0.4744868707717005, + "grad_norm": 0.2570318663817991, + "learning_rate": 0.0005654548689565164, + "loss": 2.4566, + "step": 4138 + }, + { + "epoch": 0.47460153652104115, + "grad_norm": 0.23051429074322416, + "learning_rate": 0.000565270767211614, + "loss": 2.4399, + "step": 4139 + }, + { + "epoch": 0.47471620227038186, + "grad_norm": 0.22875454841555037, + "learning_rate": 0.0005650866564638535, + "loss": 2.506, + "step": 4140 + }, + { + "epoch": 0.4748308680197225, + "grad_norm": 0.25345988244530293, + "learning_rate": 0.0005649025367386292, + "loss": 2.3816, + "step": 4141 + }, + { + "epoch": 0.4749455337690632, + "grad_norm": 0.22454371472156445, + "learning_rate": 0.0005647184080613371, + "loss": 2.4985, + "step": 4142 + }, + { + "epoch": 0.47506019951840384, + "grad_norm": 0.24890509324488688, + "learning_rate": 0.0005645342704573744, + "loss": 2.5377, + "step": 4143 + }, + { + "epoch": 0.47517486526774455, + "grad_norm": 0.25006798776180694, + "learning_rate": 0.0005643501239521393, + "loss": 2.4799, + "step": 4144 + }, + { + "epoch": 0.4752895310170852, + "grad_norm": 0.23803095789544196, + "learning_rate": 0.0005641659685710309, + "loss": 2.4267, + "step": 4145 + }, + { + "epoch": 0.47540419676642587, + "grad_norm": 0.2567255849652182, + "learning_rate": 0.0005639818043394506, + "loss": 2.5436, + "step": 4146 + }, + { + "epoch": 0.4755188625157665, + "grad_norm": 0.2438244139619499, + "learning_rate": 0.0005637976312827998, + "loss": 2.5394, + "step": 4147 + }, + { + "epoch": 0.4756335282651072, + "grad_norm": 0.2538153128820238, + "learning_rate": 0.000563613449426482, + "loss": 2.5064, + "step": 4148 + }, + { + "epoch": 0.4757481940144479, + "grad_norm": 0.28925088969359114, + "learning_rate": 0.0005634292587959012, + "loss": 2.4247, + "step": 4149 + }, + { + "epoch": 0.47586285976378856, + "grad_norm": 0.27083006504708856, + "learning_rate": 0.0005632450594164635, + "loss": 2.5274, + "step": 4150 + }, + { + "epoch": 0.4759775255131292, + "grad_norm": 0.2622811849663945, + "learning_rate": 0.0005630608513135752, + "loss": 2.367, + "step": 4151 + }, + { + "epoch": 0.4760921912624699, + "grad_norm": 0.2880598981528039, + "learning_rate": 0.0005628766345126446, + "loss": 2.5864, + "step": 4152 + }, + { + "epoch": 0.4762068570118106, + "grad_norm": 0.2660879487401973, + "learning_rate": 0.0005626924090390808, + "loss": 2.4833, + "step": 4153 + }, + { + "epoch": 0.47632152276115125, + "grad_norm": 0.24466872053368477, + "learning_rate": 0.0005625081749182942, + "loss": 2.5885, + "step": 4154 + }, + { + "epoch": 0.4764361885104919, + "grad_norm": 0.24965560831163044, + "learning_rate": 0.0005623239321756963, + "loss": 2.4572, + "step": 4155 + }, + { + "epoch": 0.47655085425983257, + "grad_norm": 0.25674691599470817, + "learning_rate": 0.0005621396808367001, + "loss": 2.4566, + "step": 4156 + }, + { + "epoch": 0.4766655200091733, + "grad_norm": 0.2185965287018234, + "learning_rate": 0.0005619554209267193, + "loss": 2.3968, + "step": 4157 + }, + { + "epoch": 0.47678018575851394, + "grad_norm": 0.2498996348556679, + "learning_rate": 0.0005617711524711691, + "loss": 2.5938, + "step": 4158 + }, + { + "epoch": 0.4768948515078546, + "grad_norm": 0.25388160847277025, + "learning_rate": 0.0005615868754954661, + "loss": 2.442, + "step": 4159 + }, + { + "epoch": 0.47700951725719526, + "grad_norm": 0.2740946707734979, + "learning_rate": 0.0005614025900250276, + "loss": 2.3021, + "step": 4160 + }, + { + "epoch": 0.477124183006536, + "grad_norm": 0.2752682076295188, + "learning_rate": 0.000561218296085272, + "loss": 2.4521, + "step": 4161 + }, + { + "epoch": 0.47723884875587663, + "grad_norm": 0.21940153849400076, + "learning_rate": 0.0005610339937016195, + "loss": 2.5537, + "step": 4162 + }, + { + "epoch": 0.4773535145052173, + "grad_norm": 0.23835519195762034, + "learning_rate": 0.0005608496828994911, + "loss": 2.4207, + "step": 4163 + }, + { + "epoch": 0.47746818025455795, + "grad_norm": 0.24430055114641686, + "learning_rate": 0.0005606653637043087, + "loss": 2.635, + "step": 4164 + }, + { + "epoch": 0.4775828460038986, + "grad_norm": 0.27245225007613466, + "learning_rate": 0.0005604810361414958, + "loss": 2.5286, + "step": 4165 + }, + { + "epoch": 0.4776975117532393, + "grad_norm": 0.22963128586497952, + "learning_rate": 0.0005602967002364768, + "loss": 2.5396, + "step": 4166 + }, + { + "epoch": 0.47781217750258, + "grad_norm": 0.24359699584858852, + "learning_rate": 0.0005601123560146774, + "loss": 2.4934, + "step": 4167 + }, + { + "epoch": 0.47792684325192064, + "grad_norm": 0.24688195138327487, + "learning_rate": 0.0005599280035015243, + "loss": 2.4549, + "step": 4168 + }, + { + "epoch": 0.4780415090012613, + "grad_norm": 0.24096815241092795, + "learning_rate": 0.0005597436427224455, + "loss": 2.5376, + "step": 4169 + }, + { + "epoch": 0.478156174750602, + "grad_norm": 0.2665731500867811, + "learning_rate": 0.00055955927370287, + "loss": 2.6084, + "step": 4170 + }, + { + "epoch": 0.4782708404999427, + "grad_norm": 0.2737497553961168, + "learning_rate": 0.000559374896468228, + "loss": 2.5465, + "step": 4171 + }, + { + "epoch": 0.47838550624928333, + "grad_norm": 0.25154056533526276, + "learning_rate": 0.0005591905110439507, + "loss": 2.4183, + "step": 4172 + }, + { + "epoch": 0.478500171998624, + "grad_norm": 0.23683953615149914, + "learning_rate": 0.0005590061174554708, + "loss": 2.4335, + "step": 4173 + }, + { + "epoch": 0.4786148377479647, + "grad_norm": 0.24164204542028214, + "learning_rate": 0.0005588217157282217, + "loss": 2.4426, + "step": 4174 + }, + { + "epoch": 0.47872950349730536, + "grad_norm": 0.23853892285568642, + "learning_rate": 0.0005586373058876383, + "loss": 2.41, + "step": 4175 + }, + { + "epoch": 0.478844169246646, + "grad_norm": 0.22510349155277046, + "learning_rate": 0.0005584528879591562, + "loss": 2.4898, + "step": 4176 + }, + { + "epoch": 0.4789588349959867, + "grad_norm": 0.20392768209206685, + "learning_rate": 0.0005582684619682123, + "loss": 2.5063, + "step": 4177 + }, + { + "epoch": 0.4790735007453274, + "grad_norm": 0.24279545892432144, + "learning_rate": 0.0005580840279402448, + "loss": 2.5717, + "step": 4178 + }, + { + "epoch": 0.47918816649466806, + "grad_norm": 0.2403223707295445, + "learning_rate": 0.000557899585900693, + "loss": 2.4519, + "step": 4179 + }, + { + "epoch": 0.4793028322440087, + "grad_norm": 0.2617493685594155, + "learning_rate": 0.0005577151358749968, + "loss": 2.5576, + "step": 4180 + }, + { + "epoch": 0.4794174979933494, + "grad_norm": 0.24416194030712499, + "learning_rate": 0.0005575306778885978, + "loss": 2.4738, + "step": 4181 + }, + { + "epoch": 0.47953216374269003, + "grad_norm": 0.21831416887248878, + "learning_rate": 0.0005573462119669386, + "loss": 2.3628, + "step": 4182 + }, + { + "epoch": 0.47964682949203075, + "grad_norm": 0.24711339501712612, + "learning_rate": 0.0005571617381354622, + "loss": 2.4892, + "step": 4183 + }, + { + "epoch": 0.4797614952413714, + "grad_norm": 0.2856610249724389, + "learning_rate": 0.0005569772564196139, + "loss": 2.5061, + "step": 4184 + }, + { + "epoch": 0.47987616099071206, + "grad_norm": 0.24541557651021437, + "learning_rate": 0.0005567927668448392, + "loss": 2.4748, + "step": 4185 + }, + { + "epoch": 0.4799908267400527, + "grad_norm": 0.266935338012606, + "learning_rate": 0.0005566082694365847, + "loss": 2.4486, + "step": 4186 + }, + { + "epoch": 0.48010549248939344, + "grad_norm": 0.32799397814201664, + "learning_rate": 0.0005564237642202987, + "loss": 2.3662, + "step": 4187 + }, + { + "epoch": 0.4802201582387341, + "grad_norm": 0.2218832535166006, + "learning_rate": 0.0005562392512214299, + "loss": 2.4748, + "step": 4188 + }, + { + "epoch": 0.48033482398807475, + "grad_norm": 0.23240099261642552, + "learning_rate": 0.0005560547304654282, + "loss": 2.53, + "step": 4189 + }, + { + "epoch": 0.4804494897374154, + "grad_norm": 0.23974538682350902, + "learning_rate": 0.0005558702019777452, + "loss": 2.4599, + "step": 4190 + }, + { + "epoch": 0.48056415548675613, + "grad_norm": 0.2429648295185892, + "learning_rate": 0.0005556856657838327, + "loss": 2.5631, + "step": 4191 + }, + { + "epoch": 0.4806788212360968, + "grad_norm": 0.2690553335467335, + "learning_rate": 0.000555501121909144, + "loss": 2.4736, + "step": 4192 + }, + { + "epoch": 0.48079348698543745, + "grad_norm": 0.24231355856882744, + "learning_rate": 0.0005553165703791335, + "loss": 2.4327, + "step": 4193 + }, + { + "epoch": 0.4809081527347781, + "grad_norm": 0.24067285009075667, + "learning_rate": 0.0005551320112192567, + "loss": 2.4934, + "step": 4194 + }, + { + "epoch": 0.4810228184841188, + "grad_norm": 0.244330097894077, + "learning_rate": 0.0005549474444549698, + "loss": 2.3214, + "step": 4195 + }, + { + "epoch": 0.4811374842334595, + "grad_norm": 0.24740247424614512, + "learning_rate": 0.0005547628701117303, + "loss": 2.521, + "step": 4196 + }, + { + "epoch": 0.48125214998280014, + "grad_norm": 0.262990618117095, + "learning_rate": 0.0005545782882149968, + "loss": 2.5422, + "step": 4197 + }, + { + "epoch": 0.4813668157321408, + "grad_norm": 0.24598372162153725, + "learning_rate": 0.0005543936987902287, + "loss": 2.4684, + "step": 4198 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.22410309874476436, + "learning_rate": 0.0005542091018628867, + "loss": 2.4555, + "step": 4199 + }, + { + "epoch": 0.48159614723082217, + "grad_norm": 0.2734295047046339, + "learning_rate": 0.0005540244974584325, + "loss": 2.5799, + "step": 4200 + }, + { + "epoch": 0.48171081298016283, + "grad_norm": 0.24319467954381446, + "learning_rate": 0.0005538398856023285, + "loss": 2.4253, + "step": 4201 + }, + { + "epoch": 0.4818254787295035, + "grad_norm": 0.27044248933350296, + "learning_rate": 0.0005536552663200387, + "loss": 2.5299, + "step": 4202 + }, + { + "epoch": 0.48194014447884415, + "grad_norm": 0.2505844276806928, + "learning_rate": 0.0005534706396370277, + "loss": 2.5111, + "step": 4203 + }, + { + "epoch": 0.48205481022818486, + "grad_norm": 0.24297564590093956, + "learning_rate": 0.0005532860055787611, + "loss": 2.4284, + "step": 4204 + }, + { + "epoch": 0.4821694759775255, + "grad_norm": 0.2665742312700134, + "learning_rate": 0.0005531013641707059, + "loss": 2.438, + "step": 4205 + }, + { + "epoch": 0.4822841417268662, + "grad_norm": 0.2677745265775324, + "learning_rate": 0.0005529167154383296, + "loss": 2.4421, + "step": 4206 + }, + { + "epoch": 0.48239880747620684, + "grad_norm": 0.2670371489279821, + "learning_rate": 0.0005527320594071012, + "loss": 2.5502, + "step": 4207 + }, + { + "epoch": 0.48251347322554755, + "grad_norm": 0.21826003909240005, + "learning_rate": 0.0005525473961024901, + "loss": 2.4324, + "step": 4208 + }, + { + "epoch": 0.4826281389748882, + "grad_norm": 0.23030028735105276, + "learning_rate": 0.0005523627255499677, + "loss": 2.4573, + "step": 4209 + }, + { + "epoch": 0.48274280472422887, + "grad_norm": 0.23932718133943717, + "learning_rate": 0.0005521780477750054, + "loss": 2.296, + "step": 4210 + }, + { + "epoch": 0.48285747047356953, + "grad_norm": 0.24105268850172284, + "learning_rate": 0.0005519933628030757, + "loss": 2.5126, + "step": 4211 + }, + { + "epoch": 0.48297213622291024, + "grad_norm": 0.25143757053984317, + "learning_rate": 0.000551808670659653, + "loss": 2.4582, + "step": 4212 + }, + { + "epoch": 0.4830868019722509, + "grad_norm": 0.25384382760074164, + "learning_rate": 0.0005516239713702116, + "loss": 2.5686, + "step": 4213 + }, + { + "epoch": 0.48320146772159156, + "grad_norm": 0.2916862086286595, + "learning_rate": 0.0005514392649602273, + "loss": 2.4804, + "step": 4214 + }, + { + "epoch": 0.4833161334709322, + "grad_norm": 0.2729425916295906, + "learning_rate": 0.0005512545514551772, + "loss": 2.6418, + "step": 4215 + }, + { + "epoch": 0.4834307992202729, + "grad_norm": 0.25862163737338417, + "learning_rate": 0.0005510698308805385, + "loss": 2.3957, + "step": 4216 + }, + { + "epoch": 0.4835454649696136, + "grad_norm": 0.26077714663841295, + "learning_rate": 0.0005508851032617902, + "loss": 2.4755, + "step": 4217 + }, + { + "epoch": 0.48366013071895425, + "grad_norm": 0.2555993867906374, + "learning_rate": 0.0005507003686244119, + "loss": 2.5443, + "step": 4218 + }, + { + "epoch": 0.4837747964682949, + "grad_norm": 0.23200621062770846, + "learning_rate": 0.0005505156269938841, + "loss": 2.545, + "step": 4219 + }, + { + "epoch": 0.48388946221763557, + "grad_norm": 0.23766825825499693, + "learning_rate": 0.0005503308783956886, + "loss": 2.5025, + "step": 4220 + }, + { + "epoch": 0.4840041279669763, + "grad_norm": 0.24549516255760703, + "learning_rate": 0.0005501461228553075, + "loss": 2.3714, + "step": 4221 + }, + { + "epoch": 0.48411879371631694, + "grad_norm": 0.2399244918309305, + "learning_rate": 0.000549961360398225, + "loss": 2.6042, + "step": 4222 + }, + { + "epoch": 0.4842334594656576, + "grad_norm": 0.23892995641309037, + "learning_rate": 0.0005497765910499249, + "loss": 2.4644, + "step": 4223 + }, + { + "epoch": 0.48434812521499826, + "grad_norm": 0.2305873569472715, + "learning_rate": 0.0005495918148358931, + "loss": 2.5181, + "step": 4224 + }, + { + "epoch": 0.484462790964339, + "grad_norm": 0.21161925391713776, + "learning_rate": 0.0005494070317816157, + "loss": 2.3756, + "step": 4225 + }, + { + "epoch": 0.48457745671367963, + "grad_norm": 0.19553756490611487, + "learning_rate": 0.00054922224191258, + "loss": 2.4679, + "step": 4226 + }, + { + "epoch": 0.4846921224630203, + "grad_norm": 0.2509488475806269, + "learning_rate": 0.0005490374452542743, + "loss": 2.4931, + "step": 4227 + }, + { + "epoch": 0.48480678821236095, + "grad_norm": 0.2032686860834027, + "learning_rate": 0.0005488526418321877, + "loss": 2.5107, + "step": 4228 + }, + { + "epoch": 0.48492145396170167, + "grad_norm": 0.2570530359428502, + "learning_rate": 0.0005486678316718106, + "loss": 2.4928, + "step": 4229 + }, + { + "epoch": 0.4850361197110423, + "grad_norm": 0.21508363925198118, + "learning_rate": 0.0005484830147986336, + "loss": 2.5095, + "step": 4230 + }, + { + "epoch": 0.485150785460383, + "grad_norm": 0.23659902635576155, + "learning_rate": 0.0005482981912381488, + "loss": 2.4119, + "step": 4231 + }, + { + "epoch": 0.48526545120972364, + "grad_norm": 0.2701606249619523, + "learning_rate": 0.0005481133610158494, + "loss": 2.5991, + "step": 4232 + }, + { + "epoch": 0.4853801169590643, + "grad_norm": 0.24778337449965787, + "learning_rate": 0.0005479285241572288, + "loss": 2.6576, + "step": 4233 + }, + { + "epoch": 0.485494782708405, + "grad_norm": 0.236888107365057, + "learning_rate": 0.000547743680687782, + "loss": 2.3645, + "step": 4234 + }, + { + "epoch": 0.4856094484577457, + "grad_norm": 0.23407375060186117, + "learning_rate": 0.0005475588306330046, + "loss": 2.4974, + "step": 4235 + }, + { + "epoch": 0.48572411420708633, + "grad_norm": 0.22629806638267463, + "learning_rate": 0.0005473739740183928, + "loss": 2.531, + "step": 4236 + }, + { + "epoch": 0.485838779956427, + "grad_norm": 0.24855762162652337, + "learning_rate": 0.0005471891108694446, + "loss": 2.4044, + "step": 4237 + }, + { + "epoch": 0.4859534457057677, + "grad_norm": 0.2564242889606033, + "learning_rate": 0.0005470042412116579, + "loss": 2.464, + "step": 4238 + }, + { + "epoch": 0.48606811145510836, + "grad_norm": 0.27857970126644244, + "learning_rate": 0.0005468193650705321, + "loss": 2.423, + "step": 4239 + }, + { + "epoch": 0.486182777204449, + "grad_norm": 0.24478707403159253, + "learning_rate": 0.0005466344824715674, + "loss": 2.4987, + "step": 4240 + }, + { + "epoch": 0.4862974429537897, + "grad_norm": 0.2536700361415309, + "learning_rate": 0.0005464495934402648, + "loss": 2.4268, + "step": 4241 + }, + { + "epoch": 0.4864121087031304, + "grad_norm": 0.26261690434204255, + "learning_rate": 0.0005462646980021262, + "loss": 2.4507, + "step": 4242 + }, + { + "epoch": 0.48652677445247106, + "grad_norm": 0.36019749984865407, + "learning_rate": 0.0005460797961826542, + "loss": 2.4636, + "step": 4243 + }, + { + "epoch": 0.4866414402018117, + "grad_norm": 0.2445501595451593, + "learning_rate": 0.000545894888007353, + "loss": 2.5122, + "step": 4244 + }, + { + "epoch": 0.4867561059511524, + "grad_norm": 0.2291999464700877, + "learning_rate": 0.0005457099735017267, + "loss": 2.5294, + "step": 4245 + }, + { + "epoch": 0.4868707717004931, + "grad_norm": 0.24812820036897562, + "learning_rate": 0.0005455250526912807, + "loss": 2.6283, + "step": 4246 + }, + { + "epoch": 0.48698543744983375, + "grad_norm": 0.2231409092844094, + "learning_rate": 0.0005453401256015218, + "loss": 2.4978, + "step": 4247 + }, + { + "epoch": 0.4871001031991744, + "grad_norm": 0.2512465108857026, + "learning_rate": 0.0005451551922579566, + "loss": 2.5082, + "step": 4248 + }, + { + "epoch": 0.48721476894851506, + "grad_norm": 0.2066376412129163, + "learning_rate": 0.0005449702526860934, + "loss": 2.2685, + "step": 4249 + }, + { + "epoch": 0.4873294346978557, + "grad_norm": 0.22974447559988523, + "learning_rate": 0.0005447853069114411, + "loss": 2.529, + "step": 4250 + }, + { + "epoch": 0.48744410044719644, + "grad_norm": 0.2740874949110293, + "learning_rate": 0.0005446003549595093, + "loss": 2.4813, + "step": 4251 + }, + { + "epoch": 0.4875587661965371, + "grad_norm": 0.2566153116365614, + "learning_rate": 0.0005444153968558088, + "loss": 2.4179, + "step": 4252 + }, + { + "epoch": 0.48767343194587776, + "grad_norm": 0.23473142848904294, + "learning_rate": 0.0005442304326258508, + "loss": 2.5479, + "step": 4253 + }, + { + "epoch": 0.4877880976952184, + "grad_norm": 0.2374375579176573, + "learning_rate": 0.000544045462295148, + "loss": 2.3556, + "step": 4254 + }, + { + "epoch": 0.48790276344455913, + "grad_norm": 0.24071576626632016, + "learning_rate": 0.000543860485889213, + "loss": 2.627, + "step": 4255 + }, + { + "epoch": 0.4880174291938998, + "grad_norm": 0.23503959632370858, + "learning_rate": 0.00054367550343356, + "loss": 2.4079, + "step": 4256 + }, + { + "epoch": 0.48813209494324045, + "grad_norm": 0.22458094518413046, + "learning_rate": 0.000543490514953704, + "loss": 2.423, + "step": 4257 + }, + { + "epoch": 0.4882467606925811, + "grad_norm": 0.2398448295756372, + "learning_rate": 0.0005433055204751604, + "loss": 2.5793, + "step": 4258 + }, + { + "epoch": 0.4883614264419218, + "grad_norm": 0.23464049198236497, + "learning_rate": 0.0005431205200234457, + "loss": 2.5672, + "step": 4259 + }, + { + "epoch": 0.4884760921912625, + "grad_norm": 0.2518782650815768, + "learning_rate": 0.0005429355136240773, + "loss": 2.4267, + "step": 4260 + }, + { + "epoch": 0.48859075794060314, + "grad_norm": 0.23715372959684425, + "learning_rate": 0.0005427505013025731, + "loss": 2.4568, + "step": 4261 + }, + { + "epoch": 0.4887054236899438, + "grad_norm": 0.24645576035430528, + "learning_rate": 0.0005425654830844523, + "loss": 2.5417, + "step": 4262 + }, + { + "epoch": 0.4888200894392845, + "grad_norm": 0.24759509493121049, + "learning_rate": 0.0005423804589952344, + "loss": 2.5087, + "step": 4263 + }, + { + "epoch": 0.48893475518862517, + "grad_norm": 0.2598287931736392, + "learning_rate": 0.0005421954290604399, + "loss": 2.5177, + "step": 4264 + }, + { + "epoch": 0.48904942093796583, + "grad_norm": 0.2556963448716178, + "learning_rate": 0.0005420103933055906, + "loss": 2.4589, + "step": 4265 + }, + { + "epoch": 0.4891640866873065, + "grad_norm": 0.26849628655194163, + "learning_rate": 0.0005418253517562079, + "loss": 2.4733, + "step": 4266 + }, + { + "epoch": 0.48927875243664715, + "grad_norm": 0.26067681780266044, + "learning_rate": 0.0005416403044378156, + "loss": 2.5143, + "step": 4267 + }, + { + "epoch": 0.48939341818598786, + "grad_norm": 0.2404985866767127, + "learning_rate": 0.0005414552513759368, + "loss": 2.3637, + "step": 4268 + }, + { + "epoch": 0.4895080839353285, + "grad_norm": 0.2548735428180745, + "learning_rate": 0.0005412701925960964, + "loss": 2.4836, + "step": 4269 + }, + { + "epoch": 0.4896227496846692, + "grad_norm": 0.23476625803310852, + "learning_rate": 0.0005410851281238195, + "loss": 2.5129, + "step": 4270 + }, + { + "epoch": 0.48973741543400984, + "grad_norm": 0.23508413597369218, + "learning_rate": 0.0005409000579846324, + "loss": 2.6133, + "step": 4271 + }, + { + "epoch": 0.48985208118335055, + "grad_norm": 0.24962041487693615, + "learning_rate": 0.0005407149822040619, + "loss": 2.3489, + "step": 4272 + }, + { + "epoch": 0.4899667469326912, + "grad_norm": 0.25083219597405193, + "learning_rate": 0.0005405299008076357, + "loss": 2.6215, + "step": 4273 + }, + { + "epoch": 0.49008141268203187, + "grad_norm": 0.2426135980413356, + "learning_rate": 0.0005403448138208823, + "loss": 2.4241, + "step": 4274 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 0.23926376804241262, + "learning_rate": 0.0005401597212693308, + "loss": 2.4401, + "step": 4275 + }, + { + "epoch": 0.49031074418071324, + "grad_norm": 0.23207931903684031, + "learning_rate": 0.0005399746231785113, + "loss": 2.4661, + "step": 4276 + }, + { + "epoch": 0.4904254099300539, + "grad_norm": 0.27293068142181676, + "learning_rate": 0.0005397895195739545, + "loss": 2.383, + "step": 4277 + }, + { + "epoch": 0.49054007567939456, + "grad_norm": 0.2537311051258877, + "learning_rate": 0.0005396044104811921, + "loss": 2.3339, + "step": 4278 + }, + { + "epoch": 0.4906547414287352, + "grad_norm": 0.24632938681352637, + "learning_rate": 0.000539419295925756, + "loss": 2.3931, + "step": 4279 + }, + { + "epoch": 0.49076940717807593, + "grad_norm": 0.22728189776320773, + "learning_rate": 0.0005392341759331795, + "loss": 2.521, + "step": 4280 + }, + { + "epoch": 0.4908840729274166, + "grad_norm": 0.2674274932067508, + "learning_rate": 0.0005390490505289962, + "loss": 2.5069, + "step": 4281 + }, + { + "epoch": 0.49099873867675725, + "grad_norm": 0.2563514920239442, + "learning_rate": 0.0005388639197387409, + "loss": 2.5872, + "step": 4282 + }, + { + "epoch": 0.4911134044260979, + "grad_norm": 0.2502072809615758, + "learning_rate": 0.0005386787835879486, + "loss": 2.4592, + "step": 4283 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 0.2765230776312634, + "learning_rate": 0.0005384936421021553, + "loss": 2.5667, + "step": 4284 + }, + { + "epoch": 0.4913427359247793, + "grad_norm": 0.2200257965411627, + "learning_rate": 0.0005383084953068981, + "loss": 2.4286, + "step": 4285 + }, + { + "epoch": 0.49145740167411994, + "grad_norm": 0.25761105066310963, + "learning_rate": 0.0005381233432277139, + "loss": 2.4997, + "step": 4286 + }, + { + "epoch": 0.4915720674234606, + "grad_norm": 0.25414238638468956, + "learning_rate": 0.0005379381858901413, + "loss": 2.4525, + "step": 4287 + }, + { + "epoch": 0.49168673317280126, + "grad_norm": 0.2511874865540193, + "learning_rate": 0.0005377530233197191, + "loss": 2.5808, + "step": 4288 + }, + { + "epoch": 0.491801398922142, + "grad_norm": 0.20720540273143123, + "learning_rate": 0.000537567855541987, + "loss": 2.3213, + "step": 4289 + }, + { + "epoch": 0.49191606467148263, + "grad_norm": 0.2432710960792619, + "learning_rate": 0.0005373826825824854, + "loss": 2.453, + "step": 4290 + }, + { + "epoch": 0.4920307304208233, + "grad_norm": 0.2595504598432154, + "learning_rate": 0.0005371975044667553, + "loss": 2.4501, + "step": 4291 + }, + { + "epoch": 0.49214539617016395, + "grad_norm": 0.20816762225845506, + "learning_rate": 0.0005370123212203384, + "loss": 2.5177, + "step": 4292 + }, + { + "epoch": 0.49226006191950467, + "grad_norm": 0.22714299356906906, + "learning_rate": 0.0005368271328687774, + "loss": 2.515, + "step": 4293 + }, + { + "epoch": 0.4923747276688453, + "grad_norm": 0.22210753118075688, + "learning_rate": 0.0005366419394376154, + "loss": 2.2833, + "step": 4294 + }, + { + "epoch": 0.492489393418186, + "grad_norm": 0.25128822091647535, + "learning_rate": 0.0005364567409523963, + "loss": 2.4628, + "step": 4295 + }, + { + "epoch": 0.49260405916752664, + "grad_norm": 0.2602511270742645, + "learning_rate": 0.0005362715374386646, + "loss": 2.4961, + "step": 4296 + }, + { + "epoch": 0.49271872491686736, + "grad_norm": 0.24531584186410146, + "learning_rate": 0.0005360863289219659, + "loss": 2.5136, + "step": 4297 + }, + { + "epoch": 0.492833390666208, + "grad_norm": 0.2613062806439916, + "learning_rate": 0.000535901115427846, + "loss": 2.4943, + "step": 4298 + }, + { + "epoch": 0.4929480564155487, + "grad_norm": 0.26172691646817, + "learning_rate": 0.0005357158969818514, + "loss": 2.4155, + "step": 4299 + }, + { + "epoch": 0.49306272216488933, + "grad_norm": 0.2331435011732198, + "learning_rate": 0.0005355306736095298, + "loss": 2.5172, + "step": 4300 + }, + { + "epoch": 0.49317738791423, + "grad_norm": 0.2530558766681741, + "learning_rate": 0.000535345445336429, + "loss": 2.5656, + "step": 4301 + }, + { + "epoch": 0.4932920536635707, + "grad_norm": 0.24530525666215877, + "learning_rate": 0.0005351602121880976, + "loss": 2.5785, + "step": 4302 + }, + { + "epoch": 0.49340671941291137, + "grad_norm": 0.2655244967781446, + "learning_rate": 0.0005349749741900853, + "loss": 2.4208, + "step": 4303 + }, + { + "epoch": 0.493521385162252, + "grad_norm": 0.2249974826944688, + "learning_rate": 0.0005347897313679419, + "loss": 2.4915, + "step": 4304 + }, + { + "epoch": 0.4936360509115927, + "grad_norm": 0.29188190162051086, + "learning_rate": 0.0005346044837472182, + "loss": 2.5387, + "step": 4305 + }, + { + "epoch": 0.4937507166609334, + "grad_norm": 0.24289980749343582, + "learning_rate": 0.0005344192313534657, + "loss": 2.484, + "step": 4306 + }, + { + "epoch": 0.49386538241027406, + "grad_norm": 0.2249088135742346, + "learning_rate": 0.0005342339742122363, + "loss": 2.5179, + "step": 4307 + }, + { + "epoch": 0.4939800481596147, + "grad_norm": 0.23733012950216503, + "learning_rate": 0.0005340487123490826, + "loss": 2.4911, + "step": 4308 + }, + { + "epoch": 0.4940947139089554, + "grad_norm": 0.2340885346788916, + "learning_rate": 0.0005338634457895582, + "loss": 2.4287, + "step": 4309 + }, + { + "epoch": 0.4942093796582961, + "grad_norm": 0.25535947570363887, + "learning_rate": 0.000533678174559217, + "loss": 2.4026, + "step": 4310 + }, + { + "epoch": 0.49432404540763675, + "grad_norm": 0.24377776070634471, + "learning_rate": 0.0005334928986836133, + "loss": 2.426, + "step": 4311 + }, + { + "epoch": 0.4944387111569774, + "grad_norm": 0.2581042010118711, + "learning_rate": 0.000533307618188303, + "loss": 2.3824, + "step": 4312 + }, + { + "epoch": 0.49455337690631807, + "grad_norm": 0.2661210215017931, + "learning_rate": 0.0005331223330988414, + "loss": 2.4082, + "step": 4313 + }, + { + "epoch": 0.4946680426556588, + "grad_norm": 0.23815720206257696, + "learning_rate": 0.0005329370434407854, + "loss": 2.5449, + "step": 4314 + }, + { + "epoch": 0.49478270840499944, + "grad_norm": 0.24444229151729496, + "learning_rate": 0.0005327517492396922, + "loss": 2.5335, + "step": 4315 + }, + { + "epoch": 0.4948973741543401, + "grad_norm": 0.23806688669118478, + "learning_rate": 0.0005325664505211194, + "loss": 2.551, + "step": 4316 + }, + { + "epoch": 0.49501203990368076, + "grad_norm": 0.2504947960283395, + "learning_rate": 0.0005323811473106256, + "loss": 2.4301, + "step": 4317 + }, + { + "epoch": 0.4951267056530214, + "grad_norm": 0.22534763705446684, + "learning_rate": 0.0005321958396337696, + "loss": 2.4637, + "step": 4318 + }, + { + "epoch": 0.49524137140236213, + "grad_norm": 0.2618821137478578, + "learning_rate": 0.0005320105275161115, + "loss": 2.4316, + "step": 4319 + }, + { + "epoch": 0.4953560371517028, + "grad_norm": 0.2558620385536041, + "learning_rate": 0.0005318252109832111, + "loss": 2.6486, + "step": 4320 + }, + { + "epoch": 0.49547070290104345, + "grad_norm": 0.2521347483261181, + "learning_rate": 0.0005316398900606296, + "loss": 2.5281, + "step": 4321 + }, + { + "epoch": 0.4955853686503841, + "grad_norm": 0.22726251697872826, + "learning_rate": 0.0005314545647739283, + "loss": 2.5423, + "step": 4322 + }, + { + "epoch": 0.4957000343997248, + "grad_norm": 0.22684565656876438, + "learning_rate": 0.0005312692351486693, + "loss": 2.4285, + "step": 4323 + }, + { + "epoch": 0.4958147001490655, + "grad_norm": 0.23929360932044078, + "learning_rate": 0.0005310839012104155, + "loss": 2.351, + "step": 4324 + }, + { + "epoch": 0.49592936589840614, + "grad_norm": 0.23307035068356424, + "learning_rate": 0.00053089856298473, + "loss": 2.4303, + "step": 4325 + }, + { + "epoch": 0.4960440316477468, + "grad_norm": 0.24511968533819883, + "learning_rate": 0.0005307132204971768, + "loss": 2.5271, + "step": 4326 + }, + { + "epoch": 0.4961586973970875, + "grad_norm": 0.23896102011573656, + "learning_rate": 0.00053052787377332, + "loss": 2.3507, + "step": 4327 + }, + { + "epoch": 0.49627336314642817, + "grad_norm": 0.2611983246520042, + "learning_rate": 0.0005303425228387251, + "loss": 2.4173, + "step": 4328 + }, + { + "epoch": 0.49638802889576883, + "grad_norm": 0.26947685251238335, + "learning_rate": 0.0005301571677189576, + "loss": 2.5653, + "step": 4329 + }, + { + "epoch": 0.4965026946451095, + "grad_norm": 0.2615342277467975, + "learning_rate": 0.0005299718084395837, + "loss": 2.4597, + "step": 4330 + }, + { + "epoch": 0.4966173603944502, + "grad_norm": 0.26557108328446577, + "learning_rate": 0.00052978644502617, + "loss": 2.4971, + "step": 4331 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.2198060610422783, + "learning_rate": 0.0005296010775042841, + "loss": 2.3134, + "step": 4332 + }, + { + "epoch": 0.4968466918931315, + "grad_norm": 0.29288098044585603, + "learning_rate": 0.0005294157058994936, + "loss": 2.4945, + "step": 4333 + }, + { + "epoch": 0.4969613576424722, + "grad_norm": 0.23740508081886666, + "learning_rate": 0.0005292303302373674, + "loss": 2.6261, + "step": 4334 + }, + { + "epoch": 0.49707602339181284, + "grad_norm": 0.24642642766795028, + "learning_rate": 0.0005290449505434744, + "loss": 2.5387, + "step": 4335 + }, + { + "epoch": 0.49719068914115355, + "grad_norm": 0.2432609178735972, + "learning_rate": 0.0005288595668433839, + "loss": 2.476, + "step": 4336 + }, + { + "epoch": 0.4973053548904942, + "grad_norm": 0.22396146015071064, + "learning_rate": 0.0005286741791626664, + "loss": 2.4127, + "step": 4337 + }, + { + "epoch": 0.49742002063983487, + "grad_norm": 0.2138523084117149, + "learning_rate": 0.0005284887875268925, + "loss": 2.378, + "step": 4338 + }, + { + "epoch": 0.49753468638917553, + "grad_norm": 0.2290172302443223, + "learning_rate": 0.0005283033919616331, + "loss": 2.4073, + "step": 4339 + }, + { + "epoch": 0.49764935213851624, + "grad_norm": 0.23100085948497584, + "learning_rate": 0.0005281179924924608, + "loss": 2.5252, + "step": 4340 + }, + { + "epoch": 0.4977640178878569, + "grad_norm": 0.2792997905698276, + "learning_rate": 0.000527932589144947, + "loss": 2.4047, + "step": 4341 + }, + { + "epoch": 0.49787868363719756, + "grad_norm": 0.23418684662042613, + "learning_rate": 0.0005277471819446651, + "loss": 2.4912, + "step": 4342 + }, + { + "epoch": 0.4979933493865382, + "grad_norm": 0.2640122099667919, + "learning_rate": 0.0005275617709171882, + "loss": 2.5087, + "step": 4343 + }, + { + "epoch": 0.49810801513587893, + "grad_norm": 0.2347998866606567, + "learning_rate": 0.0005273763560880907, + "loss": 2.4046, + "step": 4344 + }, + { + "epoch": 0.4982226808852196, + "grad_norm": 0.2563839596862694, + "learning_rate": 0.0005271909374829466, + "loss": 2.5381, + "step": 4345 + }, + { + "epoch": 0.49833734663456025, + "grad_norm": 0.23775486581077201, + "learning_rate": 0.0005270055151273309, + "loss": 2.4436, + "step": 4346 + }, + { + "epoch": 0.4984520123839009, + "grad_norm": 0.2378905835063179, + "learning_rate": 0.0005268200890468192, + "loss": 2.3399, + "step": 4347 + }, + { + "epoch": 0.4985666781332416, + "grad_norm": 0.23686736558516328, + "learning_rate": 0.0005266346592669875, + "loss": 2.4547, + "step": 4348 + }, + { + "epoch": 0.4986813438825823, + "grad_norm": 0.29314887510902804, + "learning_rate": 0.0005264492258134121, + "loss": 2.483, + "step": 4349 + }, + { + "epoch": 0.49879600963192294, + "grad_norm": 0.24834169641333712, + "learning_rate": 0.0005262637887116703, + "loss": 2.3696, + "step": 4350 + }, + { + "epoch": 0.4989106753812636, + "grad_norm": 0.24855628444251068, + "learning_rate": 0.0005260783479873396, + "loss": 2.4848, + "step": 4351 + }, + { + "epoch": 0.49902534113060426, + "grad_norm": 0.28044919871995994, + "learning_rate": 0.0005258929036659976, + "loss": 2.381, + "step": 4352 + }, + { + "epoch": 0.499140006879945, + "grad_norm": 0.24000288919255766, + "learning_rate": 0.0005257074557732232, + "loss": 2.4622, + "step": 4353 + }, + { + "epoch": 0.49925467262928563, + "grad_norm": 0.2402070588094161, + "learning_rate": 0.0005255220043345956, + "loss": 2.378, + "step": 4354 + }, + { + "epoch": 0.4993693383786263, + "grad_norm": 0.24815694349582698, + "learning_rate": 0.0005253365493756936, + "loss": 2.4035, + "step": 4355 + }, + { + "epoch": 0.49948400412796695, + "grad_norm": 0.23350687870001346, + "learning_rate": 0.0005251510909220977, + "loss": 2.4617, + "step": 4356 + }, + { + "epoch": 0.49959866987730767, + "grad_norm": 0.25447443417865584, + "learning_rate": 0.0005249656289993883, + "loss": 2.5667, + "step": 4357 + }, + { + "epoch": 0.4997133356266483, + "grad_norm": 0.22263325311493223, + "learning_rate": 0.0005247801636331462, + "loss": 2.4553, + "step": 4358 + }, + { + "epoch": 0.499828001375989, + "grad_norm": 0.26593953278004306, + "learning_rate": 0.000524594694848953, + "loss": 2.5588, + "step": 4359 + }, + { + "epoch": 0.49994266712532964, + "grad_norm": 0.24116554696319004, + "learning_rate": 0.0005244092226723903, + "loss": 2.5647, + "step": 4360 + }, + { + "epoch": 0.5000573328746704, + "grad_norm": 0.2154257618620745, + "learning_rate": 0.0005242237471290407, + "loss": 2.5087, + "step": 4361 + }, + { + "epoch": 0.500171998624011, + "grad_norm": 0.24405985841943548, + "learning_rate": 0.0005240382682444868, + "loss": 2.5069, + "step": 4362 + }, + { + "epoch": 0.5002866643733517, + "grad_norm": 0.24672393776592733, + "learning_rate": 0.0005238527860443122, + "loss": 2.4768, + "step": 4363 + }, + { + "epoch": 0.5004013301226924, + "grad_norm": 0.24480548735357616, + "learning_rate": 0.0005236673005541003, + "loss": 2.4839, + "step": 4364 + }, + { + "epoch": 0.500515995872033, + "grad_norm": 0.23641511146009606, + "learning_rate": 0.0005234818117994355, + "loss": 2.4447, + "step": 4365 + }, + { + "epoch": 0.5006306616213737, + "grad_norm": 0.23033054122489685, + "learning_rate": 0.0005232963198059024, + "loss": 2.4205, + "step": 4366 + }, + { + "epoch": 0.5007453273707144, + "grad_norm": 0.23471865686219945, + "learning_rate": 0.000523110824599086, + "loss": 2.4934, + "step": 4367 + }, + { + "epoch": 0.500859993120055, + "grad_norm": 0.22465373613646814, + "learning_rate": 0.0005229253262045719, + "loss": 2.377, + "step": 4368 + }, + { + "epoch": 0.5009746588693957, + "grad_norm": 0.2559427369580352, + "learning_rate": 0.000522739824647946, + "loss": 2.3529, + "step": 4369 + }, + { + "epoch": 0.5010893246187363, + "grad_norm": 0.2704878009102001, + "learning_rate": 0.0005225543199547948, + "loss": 2.4102, + "step": 4370 + }, + { + "epoch": 0.5012039903680771, + "grad_norm": 0.2570878713750787, + "learning_rate": 0.0005223688121507051, + "loss": 2.6003, + "step": 4371 + }, + { + "epoch": 0.5013186561174178, + "grad_norm": 0.24828334581651285, + "learning_rate": 0.0005221833012612642, + "loss": 2.3542, + "step": 4372 + }, + { + "epoch": 0.5014333218667584, + "grad_norm": 0.24665577293376492, + "learning_rate": 0.0005219977873120596, + "loss": 2.5724, + "step": 4373 + }, + { + "epoch": 0.5015479876160991, + "grad_norm": 0.24817184883593726, + "learning_rate": 0.0005218122703286797, + "loss": 2.5644, + "step": 4374 + }, + { + "epoch": 0.5016626533654397, + "grad_norm": 0.26257739917337064, + "learning_rate": 0.0005216267503367127, + "loss": 2.2543, + "step": 4375 + }, + { + "epoch": 0.5017773191147804, + "grad_norm": 0.26173233982638006, + "learning_rate": 0.0005214412273617478, + "loss": 2.4989, + "step": 4376 + }, + { + "epoch": 0.5018919848641211, + "grad_norm": 0.22745211643805005, + "learning_rate": 0.0005212557014293744, + "loss": 2.2826, + "step": 4377 + }, + { + "epoch": 0.5020066506134617, + "grad_norm": 0.26922479109104175, + "learning_rate": 0.0005210701725651821, + "loss": 2.489, + "step": 4378 + }, + { + "epoch": 0.5021213163628024, + "grad_norm": 0.246191625954822, + "learning_rate": 0.0005208846407947612, + "loss": 2.4276, + "step": 4379 + }, + { + "epoch": 0.5022359821121432, + "grad_norm": 0.22786411001936432, + "learning_rate": 0.000520699106143702, + "loss": 2.4541, + "step": 4380 + }, + { + "epoch": 0.5023506478614838, + "grad_norm": 0.24946120802188787, + "learning_rate": 0.0005205135686375958, + "loss": 2.4557, + "step": 4381 + }, + { + "epoch": 0.5024653136108245, + "grad_norm": 0.24385283179877318, + "learning_rate": 0.0005203280283020338, + "loss": 2.5944, + "step": 4382 + }, + { + "epoch": 0.5025799793601651, + "grad_norm": 0.2508890244482735, + "learning_rate": 0.0005201424851626078, + "loss": 2.4113, + "step": 4383 + }, + { + "epoch": 0.5026946451095058, + "grad_norm": 0.23851510829480885, + "learning_rate": 0.00051995693924491, + "loss": 2.5417, + "step": 4384 + }, + { + "epoch": 0.5028093108588465, + "grad_norm": 0.25325972274514374, + "learning_rate": 0.0005197713905745328, + "loss": 2.5853, + "step": 4385 + }, + { + "epoch": 0.5029239766081871, + "grad_norm": 0.26702654435678297, + "learning_rate": 0.0005195858391770689, + "loss": 2.6262, + "step": 4386 + }, + { + "epoch": 0.5030386423575278, + "grad_norm": 0.23603443757491216, + "learning_rate": 0.0005194002850781122, + "loss": 2.6366, + "step": 4387 + }, + { + "epoch": 0.5031533081068684, + "grad_norm": 0.23710005345337615, + "learning_rate": 0.0005192147283032557, + "loss": 2.4218, + "step": 4388 + }, + { + "epoch": 0.5032679738562091, + "grad_norm": 0.25943668770551953, + "learning_rate": 0.000519029168878094, + "loss": 2.4113, + "step": 4389 + }, + { + "epoch": 0.5033826396055499, + "grad_norm": 0.22831516495904508, + "learning_rate": 0.000518843606828221, + "loss": 2.503, + "step": 4390 + }, + { + "epoch": 0.5034973053548905, + "grad_norm": 0.21658715154696095, + "learning_rate": 0.0005186580421792315, + "loss": 2.3754, + "step": 4391 + }, + { + "epoch": 0.5036119711042312, + "grad_norm": 0.2547942756977791, + "learning_rate": 0.0005184724749567209, + "loss": 2.3781, + "step": 4392 + }, + { + "epoch": 0.5037266368535719, + "grad_norm": 0.24651508559680219, + "learning_rate": 0.0005182869051862844, + "loss": 2.35, + "step": 4393 + }, + { + "epoch": 0.5038413026029125, + "grad_norm": 0.2600624583993239, + "learning_rate": 0.0005181013328935181, + "loss": 2.4508, + "step": 4394 + }, + { + "epoch": 0.5039559683522532, + "grad_norm": 0.2606676702325652, + "learning_rate": 0.0005179157581040178, + "loss": 2.6062, + "step": 4395 + }, + { + "epoch": 0.5040706341015938, + "grad_norm": 0.2772535738480416, + "learning_rate": 0.0005177301808433802, + "loss": 2.512, + "step": 4396 + }, + { + "epoch": 0.5041852998509345, + "grad_norm": 0.24186459735077867, + "learning_rate": 0.0005175446011372022, + "loss": 2.3872, + "step": 4397 + }, + { + "epoch": 0.5042999656002752, + "grad_norm": 0.2558804130529151, + "learning_rate": 0.0005173590190110808, + "loss": 2.4312, + "step": 4398 + }, + { + "epoch": 0.5044146313496158, + "grad_norm": 0.24112551217120248, + "learning_rate": 0.0005171734344906136, + "loss": 2.3829, + "step": 4399 + }, + { + "epoch": 0.5045292970989566, + "grad_norm": 0.2415728109942904, + "learning_rate": 0.0005169878476013986, + "loss": 2.3493, + "step": 4400 + }, + { + "epoch": 0.5046439628482973, + "grad_norm": 0.246232785858063, + "learning_rate": 0.0005168022583690339, + "loss": 2.5167, + "step": 4401 + }, + { + "epoch": 0.5047586285976379, + "grad_norm": 0.26085823164738764, + "learning_rate": 0.0005166166668191176, + "loss": 2.4536, + "step": 4402 + }, + { + "epoch": 0.5048732943469786, + "grad_norm": 0.22342981142176194, + "learning_rate": 0.0005164310729772492, + "loss": 2.4591, + "step": 4403 + }, + { + "epoch": 0.5049879600963192, + "grad_norm": 0.2376840349946497, + "learning_rate": 0.0005162454768690274, + "loss": 2.4508, + "step": 4404 + }, + { + "epoch": 0.5051026258456599, + "grad_norm": 0.25095568889004893, + "learning_rate": 0.0005160598785200515, + "loss": 2.5652, + "step": 4405 + }, + { + "epoch": 0.5052172915950006, + "grad_norm": 0.20828091036306182, + "learning_rate": 0.0005158742779559217, + "loss": 2.3737, + "step": 4406 + }, + { + "epoch": 0.5053319573443412, + "grad_norm": 0.2440456995035093, + "learning_rate": 0.0005156886752022379, + "loss": 2.4314, + "step": 4407 + }, + { + "epoch": 0.5054466230936819, + "grad_norm": 0.23002148398894387, + "learning_rate": 0.0005155030702846002, + "loss": 2.2855, + "step": 4408 + }, + { + "epoch": 0.5055612888430225, + "grad_norm": 0.24588306016190825, + "learning_rate": 0.0005153174632286097, + "loss": 2.4195, + "step": 4409 + }, + { + "epoch": 0.5056759545923633, + "grad_norm": 0.22560849928427396, + "learning_rate": 0.000515131854059867, + "loss": 2.4525, + "step": 4410 + }, + { + "epoch": 0.505790620341704, + "grad_norm": 0.21367860373916914, + "learning_rate": 0.0005149462428039734, + "loss": 2.4725, + "step": 4411 + }, + { + "epoch": 0.5059052860910446, + "grad_norm": 0.23780094065203372, + "learning_rate": 0.0005147606294865307, + "loss": 2.3423, + "step": 4412 + }, + { + "epoch": 0.5060199518403853, + "grad_norm": 0.2282695418919905, + "learning_rate": 0.0005145750141331405, + "loss": 2.4634, + "step": 4413 + }, + { + "epoch": 0.506134617589726, + "grad_norm": 0.26447227568405257, + "learning_rate": 0.0005143893967694047, + "loss": 2.3778, + "step": 4414 + }, + { + "epoch": 0.5062492833390666, + "grad_norm": 0.24072813448579483, + "learning_rate": 0.0005142037774209262, + "loss": 2.4324, + "step": 4415 + }, + { + "epoch": 0.5063639490884073, + "grad_norm": 0.2540354720690453, + "learning_rate": 0.0005140181561133072, + "loss": 2.5933, + "step": 4416 + }, + { + "epoch": 0.5064786148377479, + "grad_norm": 0.2427770695912424, + "learning_rate": 0.0005138325328721507, + "loss": 2.4489, + "step": 4417 + }, + { + "epoch": 0.5065932805870886, + "grad_norm": 0.22934548472730665, + "learning_rate": 0.00051364690772306, + "loss": 2.5259, + "step": 4418 + }, + { + "epoch": 0.5067079463364293, + "grad_norm": 0.2549041071744384, + "learning_rate": 0.0005134612806916387, + "loss": 2.4151, + "step": 4419 + }, + { + "epoch": 0.50682261208577, + "grad_norm": 0.2659909621739589, + "learning_rate": 0.0005132756518034901, + "loss": 2.4278, + "step": 4420 + }, + { + "epoch": 0.5069372778351107, + "grad_norm": 0.23455112783123422, + "learning_rate": 0.0005130900210842185, + "loss": 2.509, + "step": 4421 + }, + { + "epoch": 0.5070519435844513, + "grad_norm": 0.2664959095586685, + "learning_rate": 0.000512904388559428, + "loss": 2.54, + "step": 4422 + }, + { + "epoch": 0.507166609333792, + "grad_norm": 0.2547826840573038, + "learning_rate": 0.000512718754254723, + "loss": 2.5208, + "step": 4423 + }, + { + "epoch": 0.5072812750831327, + "grad_norm": 0.2441892015538959, + "learning_rate": 0.0005125331181957083, + "loss": 2.2279, + "step": 4424 + }, + { + "epoch": 0.5073959408324733, + "grad_norm": 0.21239476219175457, + "learning_rate": 0.000512347480407989, + "loss": 2.4623, + "step": 4425 + }, + { + "epoch": 0.507510606581814, + "grad_norm": 0.2745595924956919, + "learning_rate": 0.00051216184091717, + "loss": 2.4629, + "step": 4426 + }, + { + "epoch": 0.5076252723311547, + "grad_norm": 0.22795539508952897, + "learning_rate": 0.0005119761997488569, + "loss": 2.3366, + "step": 4427 + }, + { + "epoch": 0.5077399380804953, + "grad_norm": 0.2593414101848472, + "learning_rate": 0.0005117905569286552, + "loss": 2.438, + "step": 4428 + }, + { + "epoch": 0.507854603829836, + "grad_norm": 0.2336111029956309, + "learning_rate": 0.0005116049124821713, + "loss": 2.4158, + "step": 4429 + }, + { + "epoch": 0.5079692695791767, + "grad_norm": 0.24040709718790787, + "learning_rate": 0.0005114192664350107, + "loss": 2.505, + "step": 4430 + }, + { + "epoch": 0.5080839353285174, + "grad_norm": 0.26136585074887025, + "learning_rate": 0.0005112336188127798, + "loss": 2.438, + "step": 4431 + }, + { + "epoch": 0.5081986010778581, + "grad_norm": 0.26678426255053617, + "learning_rate": 0.0005110479696410857, + "loss": 2.4769, + "step": 4432 + }, + { + "epoch": 0.5083132668271987, + "grad_norm": 0.24903560889938164, + "learning_rate": 0.0005108623189455343, + "loss": 2.5166, + "step": 4433 + }, + { + "epoch": 0.5084279325765394, + "grad_norm": 0.25636261895291235, + "learning_rate": 0.0005106766667517335, + "loss": 2.3231, + "step": 4434 + }, + { + "epoch": 0.5085425983258801, + "grad_norm": 0.261907056277633, + "learning_rate": 0.0005104910130852899, + "loss": 2.5382, + "step": 4435 + }, + { + "epoch": 0.5086572640752207, + "grad_norm": 0.23130714531705523, + "learning_rate": 0.0005103053579718109, + "loss": 2.4024, + "step": 4436 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 0.25426433593529624, + "learning_rate": 0.0005101197014369043, + "loss": 2.5182, + "step": 4437 + }, + { + "epoch": 0.508886595573902, + "grad_norm": 0.2305563415556242, + "learning_rate": 0.0005099340435061778, + "loss": 2.364, + "step": 4438 + }, + { + "epoch": 0.5090012613232427, + "grad_norm": 0.2676155459551082, + "learning_rate": 0.0005097483842052393, + "loss": 2.4152, + "step": 4439 + }, + { + "epoch": 0.5091159270725835, + "grad_norm": 0.23044463646881486, + "learning_rate": 0.000509562723559697, + "loss": 2.48, + "step": 4440 + }, + { + "epoch": 0.5092305928219241, + "grad_norm": 0.24413787166101544, + "learning_rate": 0.0005093770615951594, + "loss": 2.3764, + "step": 4441 + }, + { + "epoch": 0.5093452585712648, + "grad_norm": 0.24532958513089856, + "learning_rate": 0.0005091913983372347, + "loss": 2.4351, + "step": 4442 + }, + { + "epoch": 0.5094599243206054, + "grad_norm": 0.22810286833879911, + "learning_rate": 0.0005090057338115319, + "loss": 2.4722, + "step": 4443 + }, + { + "epoch": 0.5095745900699461, + "grad_norm": 0.24140086287401355, + "learning_rate": 0.0005088200680436599, + "loss": 2.4526, + "step": 4444 + }, + { + "epoch": 0.5096892558192868, + "grad_norm": 0.25546042318538686, + "learning_rate": 0.0005086344010592275, + "loss": 2.5041, + "step": 4445 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 0.28499420258810065, + "learning_rate": 0.0005084487328838441, + "loss": 2.5198, + "step": 4446 + }, + { + "epoch": 0.5099185873179681, + "grad_norm": 0.23295272539280099, + "learning_rate": 0.0005082630635431191, + "loss": 2.3797, + "step": 4447 + }, + { + "epoch": 0.5100332530673088, + "grad_norm": 0.2585233925785146, + "learning_rate": 0.0005080773930626619, + "loss": 2.4864, + "step": 4448 + }, + { + "epoch": 0.5101479188166494, + "grad_norm": 0.241014494544708, + "learning_rate": 0.0005078917214680822, + "loss": 2.577, + "step": 4449 + }, + { + "epoch": 0.5102625845659902, + "grad_norm": 0.2636168725975801, + "learning_rate": 0.0005077060487849903, + "loss": 2.5293, + "step": 4450 + }, + { + "epoch": 0.5103772503153308, + "grad_norm": 0.2552041124364305, + "learning_rate": 0.0005075203750389956, + "loss": 2.5916, + "step": 4451 + }, + { + "epoch": 0.5104919160646715, + "grad_norm": 0.24462926706054902, + "learning_rate": 0.0005073347002557085, + "loss": 2.4131, + "step": 4452 + }, + { + "epoch": 0.5106065818140122, + "grad_norm": 0.24262747945724658, + "learning_rate": 0.0005071490244607395, + "loss": 2.3664, + "step": 4453 + }, + { + "epoch": 0.5107212475633528, + "grad_norm": 0.26114618047415733, + "learning_rate": 0.0005069633476796986, + "loss": 2.4137, + "step": 4454 + }, + { + "epoch": 0.5108359133126935, + "grad_norm": 0.2492033942841191, + "learning_rate": 0.0005067776699381969, + "loss": 2.5549, + "step": 4455 + }, + { + "epoch": 0.5109505790620341, + "grad_norm": 0.24389153957071577, + "learning_rate": 0.0005065919912618446, + "loss": 2.4683, + "step": 4456 + }, + { + "epoch": 0.5110652448113748, + "grad_norm": 0.23813897541708795, + "learning_rate": 0.0005064063116762529, + "loss": 2.3981, + "step": 4457 + }, + { + "epoch": 0.5111799105607155, + "grad_norm": 0.2341094797223545, + "learning_rate": 0.0005062206312070323, + "loss": 2.4831, + "step": 4458 + }, + { + "epoch": 0.5112945763100561, + "grad_norm": 0.2618214948187654, + "learning_rate": 0.0005060349498797945, + "loss": 2.4423, + "step": 4459 + }, + { + "epoch": 0.5114092420593969, + "grad_norm": 0.23977210626257703, + "learning_rate": 0.0005058492677201505, + "loss": 2.3942, + "step": 4460 + }, + { + "epoch": 0.5115239078087376, + "grad_norm": 0.25745009131596475, + "learning_rate": 0.0005056635847537112, + "loss": 2.464, + "step": 4461 + }, + { + "epoch": 0.5116385735580782, + "grad_norm": 0.24561943932029004, + "learning_rate": 0.0005054779010060886, + "loss": 2.4204, + "step": 4462 + }, + { + "epoch": 0.5117532393074189, + "grad_norm": 0.2417337496842403, + "learning_rate": 0.0005052922165028939, + "loss": 2.5649, + "step": 4463 + }, + { + "epoch": 0.5118679050567595, + "grad_norm": 0.24649837892767223, + "learning_rate": 0.0005051065312697387, + "loss": 2.331, + "step": 4464 + }, + { + "epoch": 0.5119825708061002, + "grad_norm": 0.236471136853237, + "learning_rate": 0.0005049208453322352, + "loss": 2.451, + "step": 4465 + }, + { + "epoch": 0.5120972365554409, + "grad_norm": 0.2644533385796049, + "learning_rate": 0.0005047351587159945, + "loss": 2.3906, + "step": 4466 + }, + { + "epoch": 0.5122119023047815, + "grad_norm": 0.24228618654172104, + "learning_rate": 0.000504549471446629, + "loss": 2.4267, + "step": 4467 + }, + { + "epoch": 0.5123265680541222, + "grad_norm": 0.24852514379482032, + "learning_rate": 0.0005043637835497507, + "loss": 2.54, + "step": 4468 + }, + { + "epoch": 0.512441233803463, + "grad_norm": 0.22740614956590777, + "learning_rate": 0.0005041780950509716, + "loss": 2.5298, + "step": 4469 + }, + { + "epoch": 0.5125558995528036, + "grad_norm": 0.2529305558200204, + "learning_rate": 0.000503992405975904, + "loss": 2.4577, + "step": 4470 + }, + { + "epoch": 0.5126705653021443, + "grad_norm": 0.26806820702665496, + "learning_rate": 0.00050380671635016, + "loss": 2.3439, + "step": 4471 + }, + { + "epoch": 0.5127852310514849, + "grad_norm": 0.2420338252787076, + "learning_rate": 0.0005036210261993523, + "loss": 2.2924, + "step": 4472 + }, + { + "epoch": 0.5128998968008256, + "grad_norm": 0.2539610864806883, + "learning_rate": 0.0005034353355490927, + "loss": 2.4823, + "step": 4473 + }, + { + "epoch": 0.5130145625501663, + "grad_norm": 0.27065266869163895, + "learning_rate": 0.0005032496444249943, + "loss": 2.5463, + "step": 4474 + }, + { + "epoch": 0.5131292282995069, + "grad_norm": 0.2608645867523156, + "learning_rate": 0.0005030639528526693, + "loss": 2.414, + "step": 4475 + }, + { + "epoch": 0.5132438940488476, + "grad_norm": 0.23714210659653825, + "learning_rate": 0.0005028782608577304, + "loss": 2.4351, + "step": 4476 + }, + { + "epoch": 0.5133585597981882, + "grad_norm": 0.2662937404700151, + "learning_rate": 0.0005026925684657902, + "loss": 2.4784, + "step": 4477 + }, + { + "epoch": 0.5134732255475289, + "grad_norm": 0.24344217044384162, + "learning_rate": 0.0005025068757024616, + "loss": 2.4705, + "step": 4478 + }, + { + "epoch": 0.5135878912968697, + "grad_norm": 0.23555082809672234, + "learning_rate": 0.0005023211825933572, + "loss": 2.4809, + "step": 4479 + }, + { + "epoch": 0.5137025570462103, + "grad_norm": 0.2347572306100918, + "learning_rate": 0.00050213548916409, + "loss": 2.4747, + "step": 4480 + }, + { + "epoch": 0.513817222795551, + "grad_norm": 0.27004970693695995, + "learning_rate": 0.0005019497954402728, + "loss": 2.4364, + "step": 4481 + }, + { + "epoch": 0.5139318885448917, + "grad_norm": 0.2235431060459389, + "learning_rate": 0.0005017641014475184, + "loss": 2.4231, + "step": 4482 + }, + { + "epoch": 0.5140465542942323, + "grad_norm": 0.26006520453402177, + "learning_rate": 0.0005015784072114397, + "loss": 2.3703, + "step": 4483 + }, + { + "epoch": 0.514161220043573, + "grad_norm": 0.28524316064969185, + "learning_rate": 0.0005013927127576501, + "loss": 2.5153, + "step": 4484 + }, + { + "epoch": 0.5142758857929136, + "grad_norm": 0.2549869709717568, + "learning_rate": 0.0005012070181117621, + "loss": 2.5243, + "step": 4485 + }, + { + "epoch": 0.5143905515422543, + "grad_norm": 0.2646476906263981, + "learning_rate": 0.0005010213232993891, + "loss": 2.4464, + "step": 4486 + }, + { + "epoch": 0.514505217291595, + "grad_norm": 0.30204712882569773, + "learning_rate": 0.000500835628346144, + "loss": 2.4255, + "step": 4487 + }, + { + "epoch": 0.5146198830409356, + "grad_norm": 0.2623138365463618, + "learning_rate": 0.00050064993327764, + "loss": 2.3483, + "step": 4488 + }, + { + "epoch": 0.5147345487902764, + "grad_norm": 0.24885624094290695, + "learning_rate": 0.0005004642381194899, + "loss": 2.4325, + "step": 4489 + }, + { + "epoch": 0.514849214539617, + "grad_norm": 0.25972020461232, + "learning_rate": 0.0005002785428973071, + "loss": 2.4662, + "step": 4490 + }, + { + "epoch": 0.5149638802889577, + "grad_norm": 0.26984783925429645, + "learning_rate": 0.0005000928476367046, + "loss": 2.3857, + "step": 4491 + }, + { + "epoch": 0.5150785460382984, + "grad_norm": 0.22819276381571366, + "learning_rate": 0.0004999071523632954, + "loss": 2.4421, + "step": 4492 + }, + { + "epoch": 0.515193211787639, + "grad_norm": 0.24383476682286662, + "learning_rate": 0.000499721457102693, + "loss": 2.5803, + "step": 4493 + }, + { + "epoch": 0.5153078775369797, + "grad_norm": 0.2609559036463876, + "learning_rate": 0.0004995357618805102, + "loss": 2.505, + "step": 4494 + }, + { + "epoch": 0.5154225432863204, + "grad_norm": 0.26360640862036133, + "learning_rate": 0.0004993500667223601, + "loss": 2.4429, + "step": 4495 + }, + { + "epoch": 0.515537209035661, + "grad_norm": 0.23544066023381935, + "learning_rate": 0.0004991643716538561, + "loss": 2.3813, + "step": 4496 + }, + { + "epoch": 0.5156518747850017, + "grad_norm": 0.2231454455377136, + "learning_rate": 0.0004989786767006109, + "loss": 2.3387, + "step": 4497 + }, + { + "epoch": 0.5157665405343423, + "grad_norm": 0.24232797548502388, + "learning_rate": 0.0004987929818882379, + "loss": 2.4783, + "step": 4498 + }, + { + "epoch": 0.5158812062836831, + "grad_norm": 0.2373161126301491, + "learning_rate": 0.0004986072872423499, + "loss": 2.447, + "step": 4499 + }, + { + "epoch": 0.5159958720330238, + "grad_norm": 0.23269584834054896, + "learning_rate": 0.0004984215927885603, + "loss": 2.4573, + "step": 4500 + }, + { + "epoch": 0.5161105377823644, + "grad_norm": 0.2603155372143211, + "learning_rate": 0.0004982358985524817, + "loss": 2.5778, + "step": 4501 + }, + { + "epoch": 0.5162252035317051, + "grad_norm": 0.22523328111648705, + "learning_rate": 0.0004980502045597272, + "loss": 2.421, + "step": 4502 + }, + { + "epoch": 0.5163398692810458, + "grad_norm": 0.2650253885494348, + "learning_rate": 0.00049786451083591, + "loss": 2.5238, + "step": 4503 + }, + { + "epoch": 0.5164545350303864, + "grad_norm": 0.25352827961005536, + "learning_rate": 0.0004976788174066428, + "loss": 2.4048, + "step": 4504 + }, + { + "epoch": 0.5165692007797271, + "grad_norm": 0.27475361726706465, + "learning_rate": 0.0004974931242975385, + "loss": 2.3863, + "step": 4505 + }, + { + "epoch": 0.5166838665290677, + "grad_norm": 0.27467705445347046, + "learning_rate": 0.0004973074315342101, + "loss": 2.4789, + "step": 4506 + }, + { + "epoch": 0.5167985322784084, + "grad_norm": 0.2454587108006379, + "learning_rate": 0.0004971217391422697, + "loss": 2.4899, + "step": 4507 + }, + { + "epoch": 0.5169131980277492, + "grad_norm": 0.24434211971417166, + "learning_rate": 0.0004969360471473309, + "loss": 2.4738, + "step": 4508 + }, + { + "epoch": 0.5170278637770898, + "grad_norm": 0.2534261479393847, + "learning_rate": 0.0004967503555750059, + "loss": 2.4561, + "step": 4509 + }, + { + "epoch": 0.5171425295264305, + "grad_norm": 0.24998579604378462, + "learning_rate": 0.0004965646644509074, + "loss": 2.4068, + "step": 4510 + }, + { + "epoch": 0.5172571952757711, + "grad_norm": 0.24228415885608864, + "learning_rate": 0.000496378973800648, + "loss": 2.4844, + "step": 4511 + }, + { + "epoch": 0.5173718610251118, + "grad_norm": 0.2723702188999539, + "learning_rate": 0.00049619328364984, + "loss": 2.3476, + "step": 4512 + }, + { + "epoch": 0.5174865267744525, + "grad_norm": 0.23970791015710996, + "learning_rate": 0.0004960075940240961, + "loss": 2.4233, + "step": 4513 + }, + { + "epoch": 0.5176011925237931, + "grad_norm": 0.22936563410271157, + "learning_rate": 0.0004958219049490284, + "loss": 2.4077, + "step": 4514 + }, + { + "epoch": 0.5177158582731338, + "grad_norm": 0.24015611618425878, + "learning_rate": 0.0004956362164502495, + "loss": 2.5292, + "step": 4515 + }, + { + "epoch": 0.5178305240224745, + "grad_norm": 0.22340294335669278, + "learning_rate": 0.0004954505285533711, + "loss": 2.4698, + "step": 4516 + }, + { + "epoch": 0.5179451897718151, + "grad_norm": 0.25484182375316994, + "learning_rate": 0.0004952648412840056, + "loss": 2.4738, + "step": 4517 + }, + { + "epoch": 0.5180598555211559, + "grad_norm": 0.24063549968310993, + "learning_rate": 0.000495079154667765, + "loss": 2.4566, + "step": 4518 + }, + { + "epoch": 0.5181745212704965, + "grad_norm": 0.2241071631857714, + "learning_rate": 0.0004948934687302614, + "loss": 2.418, + "step": 4519 + }, + { + "epoch": 0.5182891870198372, + "grad_norm": 0.21984351432177796, + "learning_rate": 0.0004947077834971061, + "loss": 2.4392, + "step": 4520 + }, + { + "epoch": 0.5184038527691779, + "grad_norm": 0.2298693333756246, + "learning_rate": 0.0004945220989939115, + "loss": 2.5411, + "step": 4521 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.23693366586345208, + "learning_rate": 0.0004943364152462887, + "loss": 2.4331, + "step": 4522 + }, + { + "epoch": 0.5186331842678592, + "grad_norm": 0.2414301085737016, + "learning_rate": 0.0004941507322798496, + "loss": 2.5474, + "step": 4523 + }, + { + "epoch": 0.5187478500171998, + "grad_norm": 0.2555304546357254, + "learning_rate": 0.0004939650501202055, + "loss": 2.5069, + "step": 4524 + }, + { + "epoch": 0.5188625157665405, + "grad_norm": 0.24726059620171312, + "learning_rate": 0.0004937793687929677, + "loss": 2.6135, + "step": 4525 + }, + { + "epoch": 0.5189771815158812, + "grad_norm": 0.24491133895399492, + "learning_rate": 0.0004935936883237474, + "loss": 2.3745, + "step": 4526 + }, + { + "epoch": 0.5190918472652218, + "grad_norm": 0.2651742163404057, + "learning_rate": 0.0004934080087381555, + "loss": 2.4878, + "step": 4527 + }, + { + "epoch": 0.5192065130145626, + "grad_norm": 0.2681261190960247, + "learning_rate": 0.0004932223300618033, + "loss": 2.5889, + "step": 4528 + }, + { + "epoch": 0.5193211787639033, + "grad_norm": 0.255648140766007, + "learning_rate": 0.0004930366523203014, + "loss": 2.5325, + "step": 4529 + }, + { + "epoch": 0.5194358445132439, + "grad_norm": 0.2572738217569529, + "learning_rate": 0.0004928509755392607, + "loss": 2.4573, + "step": 4530 + }, + { + "epoch": 0.5195505102625846, + "grad_norm": 0.24624602540049972, + "learning_rate": 0.0004926652997442917, + "loss": 2.3551, + "step": 4531 + }, + { + "epoch": 0.5196651760119252, + "grad_norm": 0.24281949106866502, + "learning_rate": 0.0004924796249610045, + "loss": 2.3984, + "step": 4532 + }, + { + "epoch": 0.5197798417612659, + "grad_norm": 0.23781891944197128, + "learning_rate": 0.0004922939512150098, + "loss": 2.3681, + "step": 4533 + }, + { + "epoch": 0.5198945075106066, + "grad_norm": 0.26569260499395503, + "learning_rate": 0.0004921082785319178, + "loss": 2.4346, + "step": 4534 + }, + { + "epoch": 0.5200091732599472, + "grad_norm": 0.2510981904280255, + "learning_rate": 0.0004919226069373382, + "loss": 2.384, + "step": 4535 + }, + { + "epoch": 0.5201238390092879, + "grad_norm": 0.23339062801106183, + "learning_rate": 0.0004917369364568811, + "loss": 2.3784, + "step": 4536 + }, + { + "epoch": 0.5202385047586287, + "grad_norm": 0.2490741506929699, + "learning_rate": 0.0004915512671161559, + "loss": 2.4568, + "step": 4537 + }, + { + "epoch": 0.5203531705079693, + "grad_norm": 0.23328243174384444, + "learning_rate": 0.0004913655989407726, + "loss": 2.4152, + "step": 4538 + }, + { + "epoch": 0.52046783625731, + "grad_norm": 0.23473355129308804, + "learning_rate": 0.0004911799319563402, + "loss": 2.3595, + "step": 4539 + }, + { + "epoch": 0.5205825020066506, + "grad_norm": 0.2405559171009568, + "learning_rate": 0.0004909942661884681, + "loss": 2.6628, + "step": 4540 + }, + { + "epoch": 0.5206971677559913, + "grad_norm": 0.24631988397109497, + "learning_rate": 0.0004908086016627653, + "loss": 2.5678, + "step": 4541 + }, + { + "epoch": 0.520811833505332, + "grad_norm": 0.23938338252289218, + "learning_rate": 0.0004906229384048407, + "loss": 2.382, + "step": 4542 + }, + { + "epoch": 0.5209264992546726, + "grad_norm": 0.277794012808303, + "learning_rate": 0.000490437276440303, + "loss": 2.4525, + "step": 4543 + }, + { + "epoch": 0.5210411650040133, + "grad_norm": 0.2774102729627632, + "learning_rate": 0.0004902516157947608, + "loss": 2.5504, + "step": 4544 + }, + { + "epoch": 0.5211558307533539, + "grad_norm": 0.24894939743213101, + "learning_rate": 0.0004900659564938223, + "loss": 2.4834, + "step": 4545 + }, + { + "epoch": 0.5212704965026946, + "grad_norm": 0.25074321101055985, + "learning_rate": 0.000489880298563096, + "loss": 2.4665, + "step": 4546 + }, + { + "epoch": 0.5213851622520354, + "grad_norm": 0.2571422675666554, + "learning_rate": 0.0004896946420281891, + "loss": 2.449, + "step": 4547 + }, + { + "epoch": 0.521499828001376, + "grad_norm": 0.2562677574020682, + "learning_rate": 0.0004895089869147102, + "loss": 2.5506, + "step": 4548 + }, + { + "epoch": 0.5216144937507167, + "grad_norm": 0.25596615454633465, + "learning_rate": 0.0004893233332482666, + "loss": 2.5829, + "step": 4549 + }, + { + "epoch": 0.5217291595000574, + "grad_norm": 0.28954517699155546, + "learning_rate": 0.0004891376810544657, + "loss": 2.5289, + "step": 4550 + }, + { + "epoch": 0.521843825249398, + "grad_norm": 0.2737888164371717, + "learning_rate": 0.0004889520303589146, + "loss": 2.4603, + "step": 4551 + }, + { + "epoch": 0.5219584909987387, + "grad_norm": 0.24085932037693727, + "learning_rate": 0.0004887663811872201, + "loss": 2.4901, + "step": 4552 + }, + { + "epoch": 0.5220731567480793, + "grad_norm": 0.271399142591164, + "learning_rate": 0.0004885807335649894, + "loss": 2.5674, + "step": 4553 + }, + { + "epoch": 0.52218782249742, + "grad_norm": 0.22342220561876536, + "learning_rate": 0.0004883950875178288, + "loss": 2.4757, + "step": 4554 + }, + { + "epoch": 0.5223024882467607, + "grad_norm": 0.2850591445903701, + "learning_rate": 0.0004882094430713447, + "loss": 2.4897, + "step": 4555 + }, + { + "epoch": 0.5224171539961013, + "grad_norm": 0.24639292969715829, + "learning_rate": 0.00048802380025114326, + "loss": 2.4004, + "step": 4556 + }, + { + "epoch": 0.522531819745442, + "grad_norm": 0.2370433655434203, + "learning_rate": 0.00048783815908283, + "loss": 2.5665, + "step": 4557 + }, + { + "epoch": 0.5226464854947827, + "grad_norm": 0.23498357577667503, + "learning_rate": 0.00048765251959201106, + "loss": 2.6257, + "step": 4558 + }, + { + "epoch": 0.5227611512441234, + "grad_norm": 0.2358855001733295, + "learning_rate": 0.00048746688180429173, + "loss": 2.4232, + "step": 4559 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.22815090782560007, + "learning_rate": 0.00048728124574527705, + "loss": 2.5505, + "step": 4560 + }, + { + "epoch": 0.5229904827428047, + "grad_norm": 0.25662112315324975, + "learning_rate": 0.00048709561144057216, + "loss": 2.4265, + "step": 4561 + }, + { + "epoch": 0.5231051484921454, + "grad_norm": 0.22889284817229524, + "learning_rate": 0.00048690997891578155, + "loss": 2.4584, + "step": 4562 + }, + { + "epoch": 0.5232198142414861, + "grad_norm": 0.23901989624374834, + "learning_rate": 0.00048672434819651, + "loss": 2.4387, + "step": 4563 + }, + { + "epoch": 0.5233344799908267, + "grad_norm": 0.22800068464520587, + "learning_rate": 0.0004865387193083615, + "loss": 2.2004, + "step": 4564 + }, + { + "epoch": 0.5234491457401674, + "grad_norm": 0.25736316087959404, + "learning_rate": 0.0004863530922769401, + "loss": 2.5902, + "step": 4565 + }, + { + "epoch": 0.523563811489508, + "grad_norm": 0.25884220968363986, + "learning_rate": 0.0004861674671278494, + "loss": 2.428, + "step": 4566 + }, + { + "epoch": 0.5236784772388487, + "grad_norm": 0.24226088930103568, + "learning_rate": 0.0004859818438866928, + "loss": 2.4733, + "step": 4567 + }, + { + "epoch": 0.5237931429881895, + "grad_norm": 0.23995797685399362, + "learning_rate": 0.0004857962225790739, + "loss": 2.2392, + "step": 4568 + }, + { + "epoch": 0.5239078087375301, + "grad_norm": 0.2674956781571016, + "learning_rate": 0.00048561060323059534, + "loss": 2.4999, + "step": 4569 + }, + { + "epoch": 0.5240224744868708, + "grad_norm": 0.30466370962613226, + "learning_rate": 0.0004854249858668597, + "loss": 2.4818, + "step": 4570 + }, + { + "epoch": 0.5241371402362115, + "grad_norm": 0.2815696982899956, + "learning_rate": 0.0004852393705134695, + "loss": 2.5881, + "step": 4571 + }, + { + "epoch": 0.5242518059855521, + "grad_norm": 0.23526110362789063, + "learning_rate": 0.0004850537571960266, + "loss": 2.4455, + "step": 4572 + }, + { + "epoch": 0.5243664717348928, + "grad_norm": 0.2195908363006188, + "learning_rate": 0.00048486814594013303, + "loss": 2.4739, + "step": 4573 + }, + { + "epoch": 0.5244811374842334, + "grad_norm": 0.24518754064148213, + "learning_rate": 0.0004846825367713904, + "loss": 2.5003, + "step": 4574 + }, + { + "epoch": 0.5245958032335741, + "grad_norm": 0.22563465792622345, + "learning_rate": 0.0004844969297153999, + "loss": 2.4513, + "step": 4575 + }, + { + "epoch": 0.5247104689829148, + "grad_norm": 0.22005795205127815, + "learning_rate": 0.00048431132479776227, + "loss": 2.4664, + "step": 4576 + }, + { + "epoch": 0.5248251347322554, + "grad_norm": 0.2510371583678022, + "learning_rate": 0.00048412572204407825, + "loss": 2.5061, + "step": 4577 + }, + { + "epoch": 0.5249398004815962, + "grad_norm": 0.2275422833707922, + "learning_rate": 0.00048394012147994853, + "loss": 2.3534, + "step": 4578 + }, + { + "epoch": 0.5250544662309368, + "grad_norm": 0.25397444999470525, + "learning_rate": 0.0004837545231309728, + "loss": 2.562, + "step": 4579 + }, + { + "epoch": 0.5251691319802775, + "grad_norm": 0.22283918138111716, + "learning_rate": 0.000483568927022751, + "loss": 2.4377, + "step": 4580 + }, + { + "epoch": 0.5252837977296182, + "grad_norm": 0.25149505028307895, + "learning_rate": 0.00048338333318088256, + "loss": 2.6205, + "step": 4581 + }, + { + "epoch": 0.5253984634789588, + "grad_norm": 0.24888288620746105, + "learning_rate": 0.0004831977416309663, + "loss": 2.5287, + "step": 4582 + }, + { + "epoch": 0.5255131292282995, + "grad_norm": 0.23885559322688754, + "learning_rate": 0.00048301215239860145, + "loss": 2.4114, + "step": 4583 + }, + { + "epoch": 0.5256277949776402, + "grad_norm": 0.23613567646283098, + "learning_rate": 0.0004828265655093865, + "loss": 2.32, + "step": 4584 + }, + { + "epoch": 0.5257424607269808, + "grad_norm": 0.269064674806033, + "learning_rate": 0.0004826409809889193, + "loss": 2.5452, + "step": 4585 + }, + { + "epoch": 0.5258571264763215, + "grad_norm": 0.2583331961057893, + "learning_rate": 0.000482455398862798, + "loss": 2.438, + "step": 4586 + }, + { + "epoch": 0.5259717922256621, + "grad_norm": 0.2609596836655053, + "learning_rate": 0.00048226981915661983, + "loss": 2.4499, + "step": 4587 + }, + { + "epoch": 0.5260864579750029, + "grad_norm": 0.25685355186972914, + "learning_rate": 0.00048208424189598233, + "loss": 2.3919, + "step": 4588 + }, + { + "epoch": 0.5262011237243436, + "grad_norm": 0.2584225980272253, + "learning_rate": 0.000481898667106482, + "loss": 2.461, + "step": 4589 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.24486046482854876, + "learning_rate": 0.0004817130948137157, + "loss": 2.505, + "step": 4590 + }, + { + "epoch": 0.5264304552230249, + "grad_norm": 0.22703577614670903, + "learning_rate": 0.0004815275250432792, + "loss": 2.5083, + "step": 4591 + }, + { + "epoch": 0.5265451209723655, + "grad_norm": 0.23781930715467273, + "learning_rate": 0.0004813419578207684, + "loss": 2.4713, + "step": 4592 + }, + { + "epoch": 0.5266597867217062, + "grad_norm": 0.26518777475859917, + "learning_rate": 0.0004811563931717791, + "loss": 2.4179, + "step": 4593 + }, + { + "epoch": 0.5267744524710469, + "grad_norm": 0.27447614968338707, + "learning_rate": 0.0004809708311219062, + "loss": 2.4915, + "step": 4594 + }, + { + "epoch": 0.5268891182203875, + "grad_norm": 0.25602481102122937, + "learning_rate": 0.00048078527169674427, + "loss": 2.4177, + "step": 4595 + }, + { + "epoch": 0.5270037839697282, + "grad_norm": 0.26484903981347924, + "learning_rate": 0.000480599714921888, + "loss": 2.5462, + "step": 4596 + }, + { + "epoch": 0.527118449719069, + "grad_norm": 0.23971267398623883, + "learning_rate": 0.000480414160822931, + "loss": 2.5884, + "step": 4597 + }, + { + "epoch": 0.5272331154684096, + "grad_norm": 0.2862140274867441, + "learning_rate": 0.0004802286094254673, + "loss": 2.4251, + "step": 4598 + }, + { + "epoch": 0.5273477812177503, + "grad_norm": 0.23638962581935394, + "learning_rate": 0.0004800430607550901, + "loss": 2.3957, + "step": 4599 + }, + { + "epoch": 0.5274624469670909, + "grad_norm": 0.2520730205747328, + "learning_rate": 0.0004798575148373923, + "loss": 2.4731, + "step": 4600 + }, + { + "epoch": 0.5275771127164316, + "grad_norm": 0.26626703650233674, + "learning_rate": 0.0004796719716979663, + "loss": 2.4797, + "step": 4601 + }, + { + "epoch": 0.5276917784657723, + "grad_norm": 0.24101367822785053, + "learning_rate": 0.00047948643136240423, + "loss": 2.35, + "step": 4602 + }, + { + "epoch": 0.5278064442151129, + "grad_norm": 0.2669703688442846, + "learning_rate": 0.00047930089385629806, + "loss": 2.5013, + "step": 4603 + }, + { + "epoch": 0.5279211099644536, + "grad_norm": 0.24304588593133702, + "learning_rate": 0.00047911535920523897, + "loss": 2.413, + "step": 4604 + }, + { + "epoch": 0.5280357757137943, + "grad_norm": 0.26784235250295524, + "learning_rate": 0.00047892982743481805, + "loss": 2.493, + "step": 4605 + }, + { + "epoch": 0.528150441463135, + "grad_norm": 0.26398050116651967, + "learning_rate": 0.0004787442985706259, + "loss": 2.4383, + "step": 4606 + }, + { + "epoch": 0.5282651072124757, + "grad_norm": 0.2436180626760646, + "learning_rate": 0.00047855877263825223, + "loss": 2.5951, + "step": 4607 + }, + { + "epoch": 0.5283797729618163, + "grad_norm": 0.25481246291855764, + "learning_rate": 0.0004783732496632873, + "loss": 2.52, + "step": 4608 + }, + { + "epoch": 0.528494438711157, + "grad_norm": 0.22740958414390638, + "learning_rate": 0.0004781877296713205, + "loss": 2.5384, + "step": 4609 + }, + { + "epoch": 0.5286091044604977, + "grad_norm": 0.24362780151568794, + "learning_rate": 0.00047800221268794055, + "loss": 2.4873, + "step": 4610 + }, + { + "epoch": 0.5287237702098383, + "grad_norm": 0.23237228129130977, + "learning_rate": 0.0004778166987387361, + "loss": 2.5083, + "step": 4611 + }, + { + "epoch": 0.528838435959179, + "grad_norm": 0.259790858297508, + "learning_rate": 0.00047763118784929494, + "loss": 2.5232, + "step": 4612 + }, + { + "epoch": 0.5289531017085196, + "grad_norm": 0.25936212780231427, + "learning_rate": 0.00047744568004520527, + "loss": 2.4998, + "step": 4613 + }, + { + "epoch": 0.5290677674578603, + "grad_norm": 0.24899769400572558, + "learning_rate": 0.000477260175352054, + "loss": 2.5072, + "step": 4614 + }, + { + "epoch": 0.529182433207201, + "grad_norm": 0.22906021723849657, + "learning_rate": 0.0004770746737954282, + "loss": 2.4629, + "step": 4615 + }, + { + "epoch": 0.5292970989565416, + "grad_norm": 0.23034056511374007, + "learning_rate": 0.0004768891754009141, + "loss": 2.4814, + "step": 4616 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.24303195738732264, + "learning_rate": 0.00047670368019409753, + "loss": 2.4799, + "step": 4617 + }, + { + "epoch": 0.5295264304552231, + "grad_norm": 0.2980009056936441, + "learning_rate": 0.00047651818820056445, + "loss": 2.4894, + "step": 4618 + }, + { + "epoch": 0.5296410962045637, + "grad_norm": 0.24330631067832084, + "learning_rate": 0.00047633269944589974, + "loss": 2.5226, + "step": 4619 + }, + { + "epoch": 0.5297557619539044, + "grad_norm": 0.25841065191585433, + "learning_rate": 0.00047614721395568786, + "loss": 2.5071, + "step": 4620 + }, + { + "epoch": 0.529870427703245, + "grad_norm": 0.2911977868436059, + "learning_rate": 0.0004759617317555133, + "loss": 2.4488, + "step": 4621 + }, + { + "epoch": 0.5299850934525857, + "grad_norm": 0.25110400322852106, + "learning_rate": 0.0004757762528709594, + "loss": 2.4924, + "step": 4622 + }, + { + "epoch": 0.5300997592019264, + "grad_norm": 0.25753205652346495, + "learning_rate": 0.0004755907773276097, + "loss": 2.4445, + "step": 4623 + }, + { + "epoch": 0.530214424951267, + "grad_norm": 0.2539497173241698, + "learning_rate": 0.0004754053051510472, + "loss": 2.4681, + "step": 4624 + }, + { + "epoch": 0.5303290907006077, + "grad_norm": 0.25438807158825577, + "learning_rate": 0.00047521983636685395, + "loss": 2.5243, + "step": 4625 + }, + { + "epoch": 0.5304437564499485, + "grad_norm": 0.24364219609792495, + "learning_rate": 0.00047503437100061184, + "loss": 2.2718, + "step": 4626 + }, + { + "epoch": 0.5305584221992891, + "grad_norm": 0.2671552847826482, + "learning_rate": 0.00047484890907790225, + "loss": 2.4255, + "step": 4627 + }, + { + "epoch": 0.5306730879486298, + "grad_norm": 0.2480661028907191, + "learning_rate": 0.0004746634506243065, + "loss": 2.4395, + "step": 4628 + }, + { + "epoch": 0.5307877536979704, + "grad_norm": 0.2534163928921084, + "learning_rate": 0.0004744779956654046, + "loss": 2.4363, + "step": 4629 + }, + { + "epoch": 0.5309024194473111, + "grad_norm": 0.24981080646140344, + "learning_rate": 0.00047429254422677684, + "loss": 2.3862, + "step": 4630 + }, + { + "epoch": 0.5310170851966518, + "grad_norm": 0.27459672072737173, + "learning_rate": 0.00047410709633400255, + "loss": 2.3953, + "step": 4631 + }, + { + "epoch": 0.5311317509459924, + "grad_norm": 0.24851827610296273, + "learning_rate": 0.0004739216520126606, + "loss": 2.5256, + "step": 4632 + }, + { + "epoch": 0.5312464166953331, + "grad_norm": 0.26508677962431915, + "learning_rate": 0.0004737362112883297, + "loss": 2.4789, + "step": 4633 + }, + { + "epoch": 0.5313610824446737, + "grad_norm": 0.2691679682286342, + "learning_rate": 0.00047355077418658793, + "loss": 2.3447, + "step": 4634 + }, + { + "epoch": 0.5314757481940144, + "grad_norm": 0.2416074009930367, + "learning_rate": 0.0004733653407330126, + "loss": 2.4471, + "step": 4635 + }, + { + "epoch": 0.5315904139433552, + "grad_norm": 0.25135398914157114, + "learning_rate": 0.00047317991095318095, + "loss": 2.4862, + "step": 4636 + }, + { + "epoch": 0.5317050796926958, + "grad_norm": 0.24549141594066873, + "learning_rate": 0.0004729944848726691, + "loss": 2.4782, + "step": 4637 + }, + { + "epoch": 0.5318197454420365, + "grad_norm": 0.26046236298012326, + "learning_rate": 0.0004728090625170535, + "loss": 2.3666, + "step": 4638 + }, + { + "epoch": 0.5319344111913772, + "grad_norm": 0.2605265027565567, + "learning_rate": 0.0004726236439119094, + "loss": 2.494, + "step": 4639 + }, + { + "epoch": 0.5320490769407178, + "grad_norm": 0.24291911666018393, + "learning_rate": 0.0004724382290828118, + "loss": 2.4268, + "step": 4640 + }, + { + "epoch": 0.5321637426900585, + "grad_norm": 0.2588075201949955, + "learning_rate": 0.00047225281805533505, + "loss": 2.4908, + "step": 4641 + }, + { + "epoch": 0.5322784084393991, + "grad_norm": 0.24160792531525202, + "learning_rate": 0.000472067410855053, + "loss": 2.361, + "step": 4642 + }, + { + "epoch": 0.5323930741887398, + "grad_norm": 0.23306628718340477, + "learning_rate": 0.0004718820075075394, + "loss": 2.3776, + "step": 4643 + }, + { + "epoch": 0.5325077399380805, + "grad_norm": 0.2386203373139482, + "learning_rate": 0.0004716966080383669, + "loss": 2.3073, + "step": 4644 + }, + { + "epoch": 0.5326224056874211, + "grad_norm": 0.2510296549276965, + "learning_rate": 0.00047151121247310766, + "loss": 2.3556, + "step": 4645 + }, + { + "epoch": 0.5327370714367619, + "grad_norm": 0.25314235060277473, + "learning_rate": 0.0004713258208373338, + "loss": 2.4995, + "step": 4646 + }, + { + "epoch": 0.5328517371861025, + "grad_norm": 0.2547876646634815, + "learning_rate": 0.0004711404331566161, + "loss": 2.3924, + "step": 4647 + }, + { + "epoch": 0.5329664029354432, + "grad_norm": 0.25854629192530687, + "learning_rate": 0.0004709550494565257, + "loss": 2.4152, + "step": 4648 + }, + { + "epoch": 0.5330810686847839, + "grad_norm": 0.2949806201028242, + "learning_rate": 0.0004707696697626326, + "loss": 2.4248, + "step": 4649 + }, + { + "epoch": 0.5331957344341245, + "grad_norm": 0.25274410886762294, + "learning_rate": 0.00047058429410050645, + "loss": 2.3734, + "step": 4650 + }, + { + "epoch": 0.5333104001834652, + "grad_norm": 0.2573701840427738, + "learning_rate": 0.00047039892249571606, + "loss": 2.5985, + "step": 4651 + }, + { + "epoch": 0.5334250659328059, + "grad_norm": 0.23772344840999685, + "learning_rate": 0.00047021355497383, + "loss": 2.4164, + "step": 4652 + }, + { + "epoch": 0.5335397316821465, + "grad_norm": 0.23416193221939205, + "learning_rate": 0.00047002819156041645, + "loss": 2.4223, + "step": 4653 + }, + { + "epoch": 0.5336543974314872, + "grad_norm": 0.24830676232138943, + "learning_rate": 0.0004698428322810424, + "loss": 2.2769, + "step": 4654 + }, + { + "epoch": 0.5337690631808278, + "grad_norm": 0.2355305122469701, + "learning_rate": 0.000469657477161275, + "loss": 2.4686, + "step": 4655 + }, + { + "epoch": 0.5338837289301686, + "grad_norm": 0.25859001001994364, + "learning_rate": 0.00046947212622668017, + "loss": 2.487, + "step": 4656 + }, + { + "epoch": 0.5339983946795093, + "grad_norm": 0.24884032352594732, + "learning_rate": 0.00046928677950282337, + "loss": 2.4796, + "step": 4657 + }, + { + "epoch": 0.5341130604288499, + "grad_norm": 0.26646456108513084, + "learning_rate": 0.0004691014370152701, + "loss": 2.4837, + "step": 4658 + }, + { + "epoch": 0.5342277261781906, + "grad_norm": 0.24023186587407666, + "learning_rate": 0.00046891609878958463, + "loss": 2.4584, + "step": 4659 + }, + { + "epoch": 0.5343423919275313, + "grad_norm": 0.25585799558015776, + "learning_rate": 0.00046873076485133075, + "loss": 2.4911, + "step": 4660 + }, + { + "epoch": 0.5344570576768719, + "grad_norm": 0.24853595574876833, + "learning_rate": 0.0004685454352260719, + "loss": 2.4854, + "step": 4661 + }, + { + "epoch": 0.5345717234262126, + "grad_norm": 0.2411603503433762, + "learning_rate": 0.0004683601099393705, + "loss": 2.4212, + "step": 4662 + }, + { + "epoch": 0.5346863891755532, + "grad_norm": 0.2247806080422925, + "learning_rate": 0.000468174789016789, + "loss": 2.3824, + "step": 4663 + }, + { + "epoch": 0.5348010549248939, + "grad_norm": 0.2847055992246278, + "learning_rate": 0.00046798947248388864, + "loss": 2.4849, + "step": 4664 + }, + { + "epoch": 0.5349157206742347, + "grad_norm": 0.252731978857553, + "learning_rate": 0.0004678041603662305, + "loss": 2.4092, + "step": 4665 + }, + { + "epoch": 0.5350303864235753, + "grad_norm": 0.23539401991227432, + "learning_rate": 0.00046761885268937456, + "loss": 2.3521, + "step": 4666 + }, + { + "epoch": 0.535145052172916, + "grad_norm": 0.2578942169881139, + "learning_rate": 0.00046743354947888054, + "loss": 2.4373, + "step": 4667 + }, + { + "epoch": 0.5352597179222566, + "grad_norm": 0.2243803182369507, + "learning_rate": 0.00046724825076030783, + "loss": 2.4011, + "step": 4668 + }, + { + "epoch": 0.5353743836715973, + "grad_norm": 0.28175014377806196, + "learning_rate": 0.00046706295655921467, + "loss": 2.59, + "step": 4669 + }, + { + "epoch": 0.535489049420938, + "grad_norm": 0.26552379246847013, + "learning_rate": 0.00046687766690115863, + "loss": 2.3609, + "step": 4670 + }, + { + "epoch": 0.5356037151702786, + "grad_norm": 0.28233356636688, + "learning_rate": 0.00046669238181169727, + "loss": 2.486, + "step": 4671 + }, + { + "epoch": 0.5357183809196193, + "grad_norm": 0.24823490688650404, + "learning_rate": 0.0004665071013163866, + "loss": 2.4991, + "step": 4672 + }, + { + "epoch": 0.53583304666896, + "grad_norm": 0.2570614725073706, + "learning_rate": 0.0004663218254407831, + "loss": 2.4058, + "step": 4673 + }, + { + "epoch": 0.5359477124183006, + "grad_norm": 0.25187742805139063, + "learning_rate": 0.0004661365542104419, + "loss": 2.4474, + "step": 4674 + }, + { + "epoch": 0.5360623781676414, + "grad_norm": 0.2745169691112177, + "learning_rate": 0.0004659512876509175, + "loss": 2.437, + "step": 4675 + }, + { + "epoch": 0.536177043916982, + "grad_norm": 0.25768263770745614, + "learning_rate": 0.00046576602578776385, + "loss": 2.4054, + "step": 4676 + }, + { + "epoch": 0.5362917096663227, + "grad_norm": 0.23882942207105035, + "learning_rate": 0.00046558076864653433, + "loss": 2.3685, + "step": 4677 + }, + { + "epoch": 0.5364063754156634, + "grad_norm": 0.25091348128675345, + "learning_rate": 0.0004653955162527818, + "loss": 2.5181, + "step": 4678 + }, + { + "epoch": 0.536521041165004, + "grad_norm": 0.25672793862912213, + "learning_rate": 0.00046521026863205814, + "loss": 2.5716, + "step": 4679 + }, + { + "epoch": 0.5366357069143447, + "grad_norm": 0.21882746115729437, + "learning_rate": 0.00046502502580991485, + "loss": 2.458, + "step": 4680 + }, + { + "epoch": 0.5367503726636853, + "grad_norm": 0.24961594745227172, + "learning_rate": 0.0004648397878119026, + "loss": 2.451, + "step": 4681 + }, + { + "epoch": 0.536865038413026, + "grad_norm": 0.2311012025906882, + "learning_rate": 0.0004646545546635712, + "loss": 2.5847, + "step": 4682 + }, + { + "epoch": 0.5369797041623667, + "grad_norm": 0.23745200022888582, + "learning_rate": 0.0004644693263904703, + "loss": 2.5883, + "step": 4683 + }, + { + "epoch": 0.5370943699117073, + "grad_norm": 0.21413096050419006, + "learning_rate": 0.0004642841030181487, + "loss": 2.4016, + "step": 4684 + }, + { + "epoch": 0.537209035661048, + "grad_norm": 0.2162963520194744, + "learning_rate": 0.0004640988845721541, + "loss": 2.3175, + "step": 4685 + }, + { + "epoch": 0.5373237014103888, + "grad_norm": 0.22919499165847834, + "learning_rate": 0.0004639136710780342, + "loss": 2.3872, + "step": 4686 + }, + { + "epoch": 0.5374383671597294, + "grad_norm": 0.2623444224014999, + "learning_rate": 0.00046372846256133527, + "loss": 2.4355, + "step": 4687 + }, + { + "epoch": 0.5375530329090701, + "grad_norm": 0.24698989586847614, + "learning_rate": 0.0004635432590476038, + "loss": 2.3981, + "step": 4688 + }, + { + "epoch": 0.5376676986584107, + "grad_norm": 0.24739849099741557, + "learning_rate": 0.0004633580605623847, + "loss": 2.361, + "step": 4689 + }, + { + "epoch": 0.5377823644077514, + "grad_norm": 0.24418558000822083, + "learning_rate": 0.00046317286713122276, + "loss": 2.3681, + "step": 4690 + }, + { + "epoch": 0.5378970301570921, + "grad_norm": 0.2676032618808143, + "learning_rate": 0.0004629876787796617, + "loss": 2.3675, + "step": 4691 + }, + { + "epoch": 0.5380116959064327, + "grad_norm": 0.2647078977128044, + "learning_rate": 0.0004628024955332447, + "loss": 2.4199, + "step": 4692 + }, + { + "epoch": 0.5381263616557734, + "grad_norm": 0.2660521501892031, + "learning_rate": 0.00046261731741751466, + "loss": 2.5033, + "step": 4693 + }, + { + "epoch": 0.5382410274051141, + "grad_norm": 0.2818862046270914, + "learning_rate": 0.0004624321444580131, + "loss": 2.4484, + "step": 4694 + }, + { + "epoch": 0.5383556931544548, + "grad_norm": 0.26238181049862547, + "learning_rate": 0.00046224697668028095, + "loss": 2.5447, + "step": 4695 + }, + { + "epoch": 0.5384703589037955, + "grad_norm": 0.2594144196868433, + "learning_rate": 0.0004620618141098589, + "loss": 2.4122, + "step": 4696 + }, + { + "epoch": 0.5385850246531361, + "grad_norm": 0.24504914839549968, + "learning_rate": 0.00046187665677228614, + "loss": 2.4263, + "step": 4697 + }, + { + "epoch": 0.5386996904024768, + "grad_norm": 0.2612884973621422, + "learning_rate": 0.000461691504693102, + "loss": 2.431, + "step": 4698 + }, + { + "epoch": 0.5388143561518175, + "grad_norm": 0.26756207763517587, + "learning_rate": 0.00046150635789784475, + "loss": 2.3662, + "step": 4699 + }, + { + "epoch": 0.5389290219011581, + "grad_norm": 0.23674732847557112, + "learning_rate": 0.0004613212164120516, + "loss": 2.5086, + "step": 4700 + }, + { + "epoch": 0.5390436876504988, + "grad_norm": 0.2778632206003852, + "learning_rate": 0.00046113608026125924, + "loss": 2.4822, + "step": 4701 + }, + { + "epoch": 0.5391583533998394, + "grad_norm": 0.23541433221025979, + "learning_rate": 0.00046095094947100373, + "loss": 2.5762, + "step": 4702 + }, + { + "epoch": 0.5392730191491801, + "grad_norm": 0.227255928058899, + "learning_rate": 0.0004607658240668206, + "loss": 2.4016, + "step": 4703 + }, + { + "epoch": 0.5393876848985208, + "grad_norm": 0.23534395298788063, + "learning_rate": 0.0004605807040742441, + "loss": 2.3824, + "step": 4704 + }, + { + "epoch": 0.5395023506478615, + "grad_norm": 0.23285254134888833, + "learning_rate": 0.0004603955895188081, + "loss": 2.5895, + "step": 4705 + }, + { + "epoch": 0.5396170163972022, + "grad_norm": 0.25154874708570435, + "learning_rate": 0.00046021048042604564, + "loss": 2.5103, + "step": 4706 + }, + { + "epoch": 0.5397316821465429, + "grad_norm": 0.22828270997892236, + "learning_rate": 0.0004600253768214887, + "loss": 2.4274, + "step": 4707 + }, + { + "epoch": 0.5398463478958835, + "grad_norm": 0.24654957134655425, + "learning_rate": 0.0004598402787306692, + "loss": 2.3992, + "step": 4708 + }, + { + "epoch": 0.5399610136452242, + "grad_norm": 0.24316804112084903, + "learning_rate": 0.00045965518617911786, + "loss": 2.3994, + "step": 4709 + }, + { + "epoch": 0.5400756793945648, + "grad_norm": 0.25898987785664634, + "learning_rate": 0.00045947009919236435, + "loss": 2.4992, + "step": 4710 + }, + { + "epoch": 0.5401903451439055, + "grad_norm": 0.24206928430250374, + "learning_rate": 0.0004592850177959383, + "loss": 2.5685, + "step": 4711 + }, + { + "epoch": 0.5403050108932462, + "grad_norm": 0.2398433788231947, + "learning_rate": 0.00045909994201536765, + "loss": 2.3811, + "step": 4712 + }, + { + "epoch": 0.5404196766425868, + "grad_norm": 0.24803970957932098, + "learning_rate": 0.00045891487187618056, + "loss": 2.4484, + "step": 4713 + }, + { + "epoch": 0.5405343423919275, + "grad_norm": 0.2894925089842307, + "learning_rate": 0.0004587298074039037, + "loss": 2.4669, + "step": 4714 + }, + { + "epoch": 0.5406490081412682, + "grad_norm": 0.25899000261603516, + "learning_rate": 0.00045854474862406337, + "loss": 2.5324, + "step": 4715 + }, + { + "epoch": 0.5407636738906089, + "grad_norm": 0.2416786979782219, + "learning_rate": 0.0004583596955621846, + "loss": 2.4619, + "step": 4716 + }, + { + "epoch": 0.5408783396399496, + "grad_norm": 0.25436326823287436, + "learning_rate": 0.000458174648243792, + "loss": 2.5906, + "step": 4717 + }, + { + "epoch": 0.5409930053892902, + "grad_norm": 0.25829377279017485, + "learning_rate": 0.00045798960669440956, + "loss": 2.4339, + "step": 4718 + }, + { + "epoch": 0.5411076711386309, + "grad_norm": 0.24624154910659335, + "learning_rate": 0.0004578045709395602, + "loss": 2.3872, + "step": 4719 + }, + { + "epoch": 0.5412223368879716, + "grad_norm": 0.2570962744073782, + "learning_rate": 0.00045761954100476576, + "loss": 2.553, + "step": 4720 + }, + { + "epoch": 0.5413370026373122, + "grad_norm": 0.2356959898736343, + "learning_rate": 0.00045743451691554796, + "loss": 2.4378, + "step": 4721 + }, + { + "epoch": 0.5414516683866529, + "grad_norm": 0.255177822487017, + "learning_rate": 0.0004572494986974269, + "loss": 2.4139, + "step": 4722 + }, + { + "epoch": 0.5415663341359935, + "grad_norm": 0.23103776261627548, + "learning_rate": 0.00045706448637592275, + "loss": 2.4921, + "step": 4723 + }, + { + "epoch": 0.5416809998853342, + "grad_norm": 0.2660353078328097, + "learning_rate": 0.00045687947997655437, + "loss": 2.4574, + "step": 4724 + }, + { + "epoch": 0.541795665634675, + "grad_norm": 0.27301421696313993, + "learning_rate": 0.00045669447952483976, + "loss": 2.4992, + "step": 4725 + }, + { + "epoch": 0.5419103313840156, + "grad_norm": 0.2580928453299047, + "learning_rate": 0.0004565094850462961, + "loss": 2.485, + "step": 4726 + }, + { + "epoch": 0.5420249971333563, + "grad_norm": 0.2482778535546584, + "learning_rate": 0.0004563244965664399, + "loss": 2.4401, + "step": 4727 + }, + { + "epoch": 0.542139662882697, + "grad_norm": 0.2768493469736648, + "learning_rate": 0.00045613951411078714, + "loss": 2.4039, + "step": 4728 + }, + { + "epoch": 0.5422543286320376, + "grad_norm": 0.258993958289199, + "learning_rate": 0.00045595453770485217, + "loss": 2.4362, + "step": 4729 + }, + { + "epoch": 0.5423689943813783, + "grad_norm": 0.2558474728423535, + "learning_rate": 0.00045576956737414926, + "loss": 2.418, + "step": 4730 + }, + { + "epoch": 0.5424836601307189, + "grad_norm": 0.2586065491994409, + "learning_rate": 0.00045558460314419147, + "loss": 2.417, + "step": 4731 + }, + { + "epoch": 0.5425983258800596, + "grad_norm": 0.23259075145338187, + "learning_rate": 0.00045539964504049074, + "loss": 2.3852, + "step": 4732 + }, + { + "epoch": 0.5427129916294003, + "grad_norm": 0.23663969363633175, + "learning_rate": 0.000455214693088559, + "loss": 2.3989, + "step": 4733 + }, + { + "epoch": 0.542827657378741, + "grad_norm": 0.24490929969464034, + "learning_rate": 0.00045502974731390674, + "loss": 2.3706, + "step": 4734 + }, + { + "epoch": 0.5429423231280817, + "grad_norm": 0.2765656448914929, + "learning_rate": 0.0004548448077420435, + "loss": 2.4253, + "step": 4735 + }, + { + "epoch": 0.5430569888774223, + "grad_norm": 0.27089406484540574, + "learning_rate": 0.0004546598743984784, + "loss": 2.4422, + "step": 4736 + }, + { + "epoch": 0.543171654626763, + "grad_norm": 0.27995017804393396, + "learning_rate": 0.00045447494730871917, + "loss": 2.4289, + "step": 4737 + }, + { + "epoch": 0.5432863203761037, + "grad_norm": 0.2567006542265347, + "learning_rate": 0.00045429002649827345, + "loss": 2.5729, + "step": 4738 + }, + { + "epoch": 0.5434009861254443, + "grad_norm": 0.23974558643316918, + "learning_rate": 0.00045410511199264704, + "loss": 2.5268, + "step": 4739 + }, + { + "epoch": 0.543515651874785, + "grad_norm": 0.2591546779227263, + "learning_rate": 0.0004539202038173458, + "loss": 2.4038, + "step": 4740 + }, + { + "epoch": 0.5436303176241257, + "grad_norm": 0.2304362580459881, + "learning_rate": 0.00045373530199787395, + "loss": 2.3838, + "step": 4741 + }, + { + "epoch": 0.5437449833734663, + "grad_norm": 0.2519189436407284, + "learning_rate": 0.00045355040655973514, + "loss": 2.4927, + "step": 4742 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 0.2418096188692322, + "learning_rate": 0.0004533655175284326, + "loss": 2.4134, + "step": 4743 + }, + { + "epoch": 0.5439743148721476, + "grad_norm": 0.23585058714850035, + "learning_rate": 0.00045318063492946796, + "loss": 2.5538, + "step": 4744 + }, + { + "epoch": 0.5440889806214884, + "grad_norm": 0.28307757995039556, + "learning_rate": 0.0004529957587883422, + "loss": 2.3316, + "step": 4745 + }, + { + "epoch": 0.5442036463708291, + "grad_norm": 0.22610493230476106, + "learning_rate": 0.00045281088913055564, + "loss": 2.4689, + "step": 4746 + }, + { + "epoch": 0.5443183121201697, + "grad_norm": 0.238522643635277, + "learning_rate": 0.00045262602598160714, + "loss": 2.408, + "step": 4747 + }, + { + "epoch": 0.5444329778695104, + "grad_norm": 0.2281824899491178, + "learning_rate": 0.00045244116936699546, + "loss": 2.5617, + "step": 4748 + }, + { + "epoch": 0.544547643618851, + "grad_norm": 0.22842107452931468, + "learning_rate": 0.00045225631931221805, + "loss": 2.4238, + "step": 4749 + }, + { + "epoch": 0.5446623093681917, + "grad_norm": 0.23344664141633537, + "learning_rate": 0.0004520714758427713, + "loss": 2.608, + "step": 4750 + }, + { + "epoch": 0.5447769751175324, + "grad_norm": 0.25368918330674783, + "learning_rate": 0.0004518866389841507, + "loss": 2.4255, + "step": 4751 + }, + { + "epoch": 0.544891640866873, + "grad_norm": 0.2742339288505932, + "learning_rate": 0.00045170180876185115, + "loss": 2.4858, + "step": 4752 + }, + { + "epoch": 0.5450063066162137, + "grad_norm": 0.24339335141298596, + "learning_rate": 0.0004515169852013665, + "loss": 2.4808, + "step": 4753 + }, + { + "epoch": 0.5451209723655545, + "grad_norm": 0.2465756717617404, + "learning_rate": 0.0004513321683281896, + "loss": 2.4352, + "step": 4754 + }, + { + "epoch": 0.5452356381148951, + "grad_norm": 0.2748192596502225, + "learning_rate": 0.0004511473581678124, + "loss": 2.3027, + "step": 4755 + }, + { + "epoch": 0.5453503038642358, + "grad_norm": 0.23259066108431095, + "learning_rate": 0.00045096255474572593, + "loss": 2.4893, + "step": 4756 + }, + { + "epoch": 0.5454649696135764, + "grad_norm": 0.25961116490693203, + "learning_rate": 0.0004507777580874201, + "loss": 2.4449, + "step": 4757 + }, + { + "epoch": 0.5455796353629171, + "grad_norm": 0.25020733335143996, + "learning_rate": 0.00045059296821838436, + "loss": 2.4756, + "step": 4758 + }, + { + "epoch": 0.5456943011122578, + "grad_norm": 0.2497629700019959, + "learning_rate": 0.00045040818516410704, + "loss": 2.4609, + "step": 4759 + }, + { + "epoch": 0.5458089668615984, + "grad_norm": 0.2538232845707576, + "learning_rate": 0.0004502234089500751, + "loss": 2.5547, + "step": 4760 + }, + { + "epoch": 0.5459236326109391, + "grad_norm": 0.23587438113223083, + "learning_rate": 0.00045003863960177523, + "loss": 2.4537, + "step": 4761 + }, + { + "epoch": 0.5460382983602798, + "grad_norm": 0.24172843415629053, + "learning_rate": 0.0004498538771446924, + "loss": 2.5872, + "step": 4762 + }, + { + "epoch": 0.5461529641096204, + "grad_norm": 0.23480705121712697, + "learning_rate": 0.0004496691216043116, + "loss": 2.3763, + "step": 4763 + }, + { + "epoch": 0.5462676298589612, + "grad_norm": 0.2431695497729364, + "learning_rate": 0.000449484373006116, + "loss": 2.4154, + "step": 4764 + }, + { + "epoch": 0.5463822956083018, + "grad_norm": 0.24406782807309, + "learning_rate": 0.00044929963137558827, + "loss": 2.4362, + "step": 4765 + }, + { + "epoch": 0.5464969613576425, + "grad_norm": 0.2757843448513498, + "learning_rate": 0.00044911489673820986, + "loss": 2.466, + "step": 4766 + }, + { + "epoch": 0.5466116271069832, + "grad_norm": 0.26371548926167176, + "learning_rate": 0.00044893016911946144, + "loss": 2.3774, + "step": 4767 + }, + { + "epoch": 0.5467262928563238, + "grad_norm": 0.245669806763138, + "learning_rate": 0.0004487454485448229, + "loss": 2.4632, + "step": 4768 + }, + { + "epoch": 0.5468409586056645, + "grad_norm": 0.27469767429938763, + "learning_rate": 0.0004485607350397727, + "loss": 2.4439, + "step": 4769 + }, + { + "epoch": 0.5469556243550051, + "grad_norm": 0.2694434982423201, + "learning_rate": 0.0004483760286297885, + "loss": 2.6263, + "step": 4770 + }, + { + "epoch": 0.5470702901043458, + "grad_norm": 0.24308423521874656, + "learning_rate": 0.00044819132934034717, + "loss": 2.3985, + "step": 4771 + }, + { + "epoch": 0.5471849558536865, + "grad_norm": 0.21250812520089382, + "learning_rate": 0.0004480066371969243, + "loss": 2.4223, + "step": 4772 + }, + { + "epoch": 0.5472996216030271, + "grad_norm": 0.23765566721697534, + "learning_rate": 0.00044782195222499465, + "loss": 2.4876, + "step": 4773 + }, + { + "epoch": 0.5474142873523679, + "grad_norm": 0.2542241916576706, + "learning_rate": 0.00044763727445003233, + "loss": 2.4335, + "step": 4774 + }, + { + "epoch": 0.5475289531017086, + "grad_norm": 0.24101558982538643, + "learning_rate": 0.00044745260389750984, + "loss": 2.4833, + "step": 4775 + }, + { + "epoch": 0.5476436188510492, + "grad_norm": 0.2588164553192838, + "learning_rate": 0.0004472679405928989, + "loss": 2.4122, + "step": 4776 + }, + { + "epoch": 0.5477582846003899, + "grad_norm": 0.2523151241094458, + "learning_rate": 0.00044708328456167037, + "loss": 2.4763, + "step": 4777 + }, + { + "epoch": 0.5478729503497305, + "grad_norm": 0.2497135084125547, + "learning_rate": 0.00044689863582929415, + "loss": 2.4549, + "step": 4778 + }, + { + "epoch": 0.5479876160990712, + "grad_norm": 0.2296860926274922, + "learning_rate": 0.0004467139944212388, + "loss": 2.4242, + "step": 4779 + }, + { + "epoch": 0.5481022818484119, + "grad_norm": 0.23376805407061785, + "learning_rate": 0.00044652936036297235, + "loss": 2.5057, + "step": 4780 + }, + { + "epoch": 0.5482169475977525, + "grad_norm": 0.2319956687226215, + "learning_rate": 0.0004463447336799614, + "loss": 2.4607, + "step": 4781 + }, + { + "epoch": 0.5483316133470932, + "grad_norm": 0.22590822955338075, + "learning_rate": 0.0004461601143976715, + "loss": 2.4667, + "step": 4782 + }, + { + "epoch": 0.5484462790964338, + "grad_norm": 0.238851169957579, + "learning_rate": 0.00044597550254156753, + "loss": 2.4381, + "step": 4783 + }, + { + "epoch": 0.5485609448457746, + "grad_norm": 0.25303266738789826, + "learning_rate": 0.0004457908981371134, + "loss": 2.6781, + "step": 4784 + }, + { + "epoch": 0.5486756105951153, + "grad_norm": 0.24889484938798123, + "learning_rate": 0.0004456063012097714, + "loss": 2.2309, + "step": 4785 + }, + { + "epoch": 0.5487902763444559, + "grad_norm": 0.23691959164838405, + "learning_rate": 0.0004454217117850034, + "loss": 2.4626, + "step": 4786 + }, + { + "epoch": 0.5489049420937966, + "grad_norm": 0.22827573812750077, + "learning_rate": 0.0004452371298882697, + "loss": 2.4969, + "step": 4787 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.21800070714855738, + "learning_rate": 0.0004450525555450303, + "loss": 2.3879, + "step": 4788 + }, + { + "epoch": 0.5491342735924779, + "grad_norm": 0.22492028767103975, + "learning_rate": 0.00044486798878074337, + "loss": 2.442, + "step": 4789 + }, + { + "epoch": 0.5492489393418186, + "grad_norm": 0.2620256696018022, + "learning_rate": 0.0004446834296208665, + "loss": 2.4345, + "step": 4790 + }, + { + "epoch": 0.5493636050911592, + "grad_norm": 0.28449624626526093, + "learning_rate": 0.00044449887809085603, + "loss": 2.5797, + "step": 4791 + }, + { + "epoch": 0.5494782708404999, + "grad_norm": 0.2552071158199875, + "learning_rate": 0.0004443143342161673, + "loss": 2.3689, + "step": 4792 + }, + { + "epoch": 0.5495929365898407, + "grad_norm": 0.26099397472656904, + "learning_rate": 0.0004441297980222549, + "loss": 2.4704, + "step": 4793 + }, + { + "epoch": 0.5497076023391813, + "grad_norm": 0.2533246002464952, + "learning_rate": 0.00044394526953457186, + "loss": 2.3271, + "step": 4794 + }, + { + "epoch": 0.549822268088522, + "grad_norm": 0.2540510707069286, + "learning_rate": 0.00044376074877857026, + "loss": 2.4818, + "step": 4795 + }, + { + "epoch": 0.5499369338378627, + "grad_norm": 0.25624462145109506, + "learning_rate": 0.00044357623577970153, + "loss": 2.533, + "step": 4796 + }, + { + "epoch": 0.5500515995872033, + "grad_norm": 0.28852682268350405, + "learning_rate": 0.0004433917305634153, + "loss": 2.5023, + "step": 4797 + }, + { + "epoch": 0.550166265336544, + "grad_norm": 0.2721323457078935, + "learning_rate": 0.0004432072331551608, + "loss": 2.5043, + "step": 4798 + }, + { + "epoch": 0.5502809310858846, + "grad_norm": 0.25130469887505796, + "learning_rate": 0.00044302274358038607, + "loss": 2.4418, + "step": 4799 + }, + { + "epoch": 0.5503955968352253, + "grad_norm": 0.24052669587058323, + "learning_rate": 0.00044283826186453784, + "loss": 2.4507, + "step": 4800 + }, + { + "epoch": 0.550510262584566, + "grad_norm": 0.2417444454044004, + "learning_rate": 0.00044265378803306165, + "loss": 2.502, + "step": 4801 + }, + { + "epoch": 0.5506249283339066, + "grad_norm": 0.26511687754838104, + "learning_rate": 0.0004424693221114022, + "loss": 2.7135, + "step": 4802 + }, + { + "epoch": 0.5507395940832474, + "grad_norm": 0.25608699706345367, + "learning_rate": 0.00044228486412500325, + "loss": 2.4244, + "step": 4803 + }, + { + "epoch": 0.550854259832588, + "grad_norm": 0.2349821935992536, + "learning_rate": 0.0004421004140993071, + "loss": 2.3313, + "step": 4804 + }, + { + "epoch": 0.5509689255819287, + "grad_norm": 0.24163751408062434, + "learning_rate": 0.00044191597205975525, + "loss": 2.2961, + "step": 4805 + }, + { + "epoch": 0.5510835913312694, + "grad_norm": 0.22484079336824905, + "learning_rate": 0.0004417315380317879, + "loss": 2.4006, + "step": 4806 + }, + { + "epoch": 0.55119825708061, + "grad_norm": 0.22689110579999638, + "learning_rate": 0.000441547112040844, + "loss": 2.518, + "step": 4807 + }, + { + "epoch": 0.5513129228299507, + "grad_norm": 0.24610240863174646, + "learning_rate": 0.0004413626941123618, + "loss": 2.4755, + "step": 4808 + }, + { + "epoch": 0.5514275885792914, + "grad_norm": 0.2346237024557651, + "learning_rate": 0.00044117828427177834, + "loss": 2.5628, + "step": 4809 + }, + { + "epoch": 0.551542254328632, + "grad_norm": 0.2603374765072642, + "learning_rate": 0.00044099388254452925, + "loss": 2.5665, + "step": 4810 + }, + { + "epoch": 0.5516569200779727, + "grad_norm": 0.2532558830398724, + "learning_rate": 0.0004408094889560494, + "loss": 2.4171, + "step": 4811 + }, + { + "epoch": 0.5517715858273133, + "grad_norm": 0.269002790605217, + "learning_rate": 0.00044062510353177207, + "loss": 2.3689, + "step": 4812 + }, + { + "epoch": 0.551886251576654, + "grad_norm": 0.26534334527646547, + "learning_rate": 0.00044044072629713014, + "loss": 2.6981, + "step": 4813 + }, + { + "epoch": 0.5520009173259948, + "grad_norm": 0.2412155199833553, + "learning_rate": 0.0004402563572775546, + "loss": 2.3746, + "step": 4814 + }, + { + "epoch": 0.5521155830753354, + "grad_norm": 0.2265862651280693, + "learning_rate": 0.00044007199649847585, + "loss": 2.3744, + "step": 4815 + }, + { + "epoch": 0.5522302488246761, + "grad_norm": 0.2528429983471182, + "learning_rate": 0.0004398876439853227, + "loss": 2.3356, + "step": 4816 + }, + { + "epoch": 0.5523449145740167, + "grad_norm": 0.2507067343275455, + "learning_rate": 0.0004397032997635232, + "loss": 2.5152, + "step": 4817 + }, + { + "epoch": 0.5524595803233574, + "grad_norm": 0.282679817104617, + "learning_rate": 0.00043951896385850426, + "loss": 2.4426, + "step": 4818 + }, + { + "epoch": 0.5525742460726981, + "grad_norm": 0.2858474799198362, + "learning_rate": 0.0004393346362956915, + "loss": 2.4098, + "step": 4819 + }, + { + "epoch": 0.5526889118220387, + "grad_norm": 0.25931048098294573, + "learning_rate": 0.00043915031710050907, + "loss": 2.5586, + "step": 4820 + }, + { + "epoch": 0.5528035775713794, + "grad_norm": 0.2975961876359454, + "learning_rate": 0.00043896600629838065, + "loss": 2.6139, + "step": 4821 + }, + { + "epoch": 0.5529182433207201, + "grad_norm": 0.23907077911512128, + "learning_rate": 0.000438781703914728, + "loss": 2.46, + "step": 4822 + }, + { + "epoch": 0.5530329090700608, + "grad_norm": 0.24174098678420827, + "learning_rate": 0.00043859740997497247, + "loss": 2.42, + "step": 4823 + }, + { + "epoch": 0.5531475748194015, + "grad_norm": 0.24615933250790234, + "learning_rate": 0.0004384131245045339, + "loss": 2.3337, + "step": 4824 + }, + { + "epoch": 0.5532622405687421, + "grad_norm": 0.26674061460191334, + "learning_rate": 0.0004382288475288309, + "loss": 2.41, + "step": 4825 + }, + { + "epoch": 0.5533769063180828, + "grad_norm": 0.24982233893714156, + "learning_rate": 0.00043804457907328076, + "loss": 2.4534, + "step": 4826 + }, + { + "epoch": 0.5534915720674235, + "grad_norm": 0.23021842315923646, + "learning_rate": 0.0004378603191632999, + "loss": 2.4235, + "step": 4827 + }, + { + "epoch": 0.5536062378167641, + "grad_norm": 0.286827671122779, + "learning_rate": 0.0004376760678243037, + "loss": 2.4163, + "step": 4828 + }, + { + "epoch": 0.5537209035661048, + "grad_norm": 0.24414778434140147, + "learning_rate": 0.0004374918250817059, + "loss": 2.4851, + "step": 4829 + }, + { + "epoch": 0.5538355693154455, + "grad_norm": 0.2427909785001659, + "learning_rate": 0.0004373075909609193, + "loss": 2.4781, + "step": 4830 + }, + { + "epoch": 0.5539502350647861, + "grad_norm": 0.253167953669717, + "learning_rate": 0.0004371233654873556, + "loss": 2.4978, + "step": 4831 + }, + { + "epoch": 0.5540649008141268, + "grad_norm": 0.24145368104312298, + "learning_rate": 0.0004369391486864249, + "loss": 2.5038, + "step": 4832 + }, + { + "epoch": 0.5541795665634675, + "grad_norm": 0.23578787823468977, + "learning_rate": 0.0004367549405835366, + "loss": 2.37, + "step": 4833 + }, + { + "epoch": 0.5542942323128082, + "grad_norm": 0.2920386891151897, + "learning_rate": 0.00043657074120409886, + "loss": 2.4679, + "step": 4834 + }, + { + "epoch": 0.5544088980621489, + "grad_norm": 0.24244960685159794, + "learning_rate": 0.0004363865505735182, + "loss": 2.5356, + "step": 4835 + }, + { + "epoch": 0.5545235638114895, + "grad_norm": 0.24090774482848445, + "learning_rate": 0.00043620236871720034, + "loss": 2.5388, + "step": 4836 + }, + { + "epoch": 0.5546382295608302, + "grad_norm": 0.2454130544457023, + "learning_rate": 0.00043601819566054943, + "loss": 2.5311, + "step": 4837 + }, + { + "epoch": 0.5547528953101708, + "grad_norm": 0.2376177636837363, + "learning_rate": 0.00043583403142896903, + "loss": 2.3974, + "step": 4838 + }, + { + "epoch": 0.5548675610595115, + "grad_norm": 0.2559091897614342, + "learning_rate": 0.00043564987604786086, + "loss": 2.3667, + "step": 4839 + }, + { + "epoch": 0.5549822268088522, + "grad_norm": 0.2671649276817025, + "learning_rate": 0.0004354657295426257, + "loss": 2.3363, + "step": 4840 + }, + { + "epoch": 0.5550968925581928, + "grad_norm": 0.24775055897088888, + "learning_rate": 0.0004352815919386629, + "loss": 2.4832, + "step": 4841 + }, + { + "epoch": 0.5552115583075335, + "grad_norm": 0.2645627712034118, + "learning_rate": 0.0004350974632613708, + "loss": 2.4937, + "step": 4842 + }, + { + "epoch": 0.5553262240568743, + "grad_norm": 0.2738756110755357, + "learning_rate": 0.0004349133435361466, + "loss": 2.4529, + "step": 4843 + }, + { + "epoch": 0.5554408898062149, + "grad_norm": 0.25294932230726797, + "learning_rate": 0.00043472923278838613, + "loss": 2.3918, + "step": 4844 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.26641653131424764, + "learning_rate": 0.0004345451310434836, + "loss": 2.4804, + "step": 4845 + }, + { + "epoch": 0.5556702213048962, + "grad_norm": 0.25896322684829565, + "learning_rate": 0.0004343610383268327, + "loss": 2.5174, + "step": 4846 + }, + { + "epoch": 0.5557848870542369, + "grad_norm": 0.2446832849067908, + "learning_rate": 0.00043417695466382524, + "loss": 2.4287, + "step": 4847 + }, + { + "epoch": 0.5558995528035776, + "grad_norm": 0.2622762843559269, + "learning_rate": 0.00043399288007985216, + "loss": 2.4142, + "step": 4848 + }, + { + "epoch": 0.5560142185529182, + "grad_norm": 0.2645581899981976, + "learning_rate": 0.00043380881460030327, + "loss": 2.3993, + "step": 4849 + }, + { + "epoch": 0.5561288843022589, + "grad_norm": 0.2945858120371693, + "learning_rate": 0.00043362475825056663, + "loss": 2.4832, + "step": 4850 + }, + { + "epoch": 0.5562435500515995, + "grad_norm": 0.2542062804006728, + "learning_rate": 0.00043344071105602927, + "loss": 2.4325, + "step": 4851 + }, + { + "epoch": 0.5563582158009402, + "grad_norm": 0.25524041626340704, + "learning_rate": 0.00043325667304207696, + "loss": 2.3308, + "step": 4852 + }, + { + "epoch": 0.556472881550281, + "grad_norm": 0.24942235264108364, + "learning_rate": 0.00043307264423409457, + "loss": 2.3737, + "step": 4853 + }, + { + "epoch": 0.5565875472996216, + "grad_norm": 0.2882378611895105, + "learning_rate": 0.0004328886246574651, + "loss": 2.2909, + "step": 4854 + }, + { + "epoch": 0.5567022130489623, + "grad_norm": 0.2546627066954701, + "learning_rate": 0.0004327046143375707, + "loss": 2.4884, + "step": 4855 + }, + { + "epoch": 0.556816878798303, + "grad_norm": 0.26281990293380664, + "learning_rate": 0.00043252061329979196, + "loss": 2.4826, + "step": 4856 + }, + { + "epoch": 0.5569315445476436, + "grad_norm": 0.2488906194697834, + "learning_rate": 0.0004323366215695081, + "loss": 2.4684, + "step": 4857 + }, + { + "epoch": 0.5570462102969843, + "grad_norm": 0.2165027454939841, + "learning_rate": 0.0004321526391720977, + "loss": 2.4967, + "step": 4858 + }, + { + "epoch": 0.5571608760463249, + "grad_norm": 0.25771986599903696, + "learning_rate": 0.00043196866613293746, + "loss": 2.5132, + "step": 4859 + }, + { + "epoch": 0.5572755417956656, + "grad_norm": 0.23557432991120528, + "learning_rate": 0.00043178470247740285, + "loss": 2.4724, + "step": 4860 + }, + { + "epoch": 0.5573902075450063, + "grad_norm": 0.2638220588035415, + "learning_rate": 0.00043160074823086835, + "loss": 2.4175, + "step": 4861 + }, + { + "epoch": 0.557504873294347, + "grad_norm": 0.2478846332566774, + "learning_rate": 0.00043141680341870657, + "loss": 2.4862, + "step": 4862 + }, + { + "epoch": 0.5576195390436877, + "grad_norm": 0.24989382796014142, + "learning_rate": 0.0004312328680662897, + "loss": 2.548, + "step": 4863 + }, + { + "epoch": 0.5577342047930284, + "grad_norm": 0.23417787595455894, + "learning_rate": 0.0004310489421989878, + "loss": 2.4768, + "step": 4864 + }, + { + "epoch": 0.557848870542369, + "grad_norm": 0.25759721254830326, + "learning_rate": 0.00043086502584217004, + "loss": 2.4369, + "step": 4865 + }, + { + "epoch": 0.5579635362917097, + "grad_norm": 0.25896098994381306, + "learning_rate": 0.000430681119021204, + "loss": 2.4641, + "step": 4866 + }, + { + "epoch": 0.5580782020410503, + "grad_norm": 0.2258031219498759, + "learning_rate": 0.0004304972217614562, + "loss": 2.5096, + "step": 4867 + }, + { + "epoch": 0.558192867790391, + "grad_norm": 0.23478104341848002, + "learning_rate": 0.000430313334088292, + "loss": 2.5601, + "step": 4868 + }, + { + "epoch": 0.5583075335397317, + "grad_norm": 0.23128315269199018, + "learning_rate": 0.00043012945602707506, + "loss": 2.4953, + "step": 4869 + }, + { + "epoch": 0.5584221992890723, + "grad_norm": 0.2235934607120747, + "learning_rate": 0.00042994558760316767, + "loss": 2.4664, + "step": 4870 + }, + { + "epoch": 0.558536865038413, + "grad_norm": 0.25388743267954794, + "learning_rate": 0.0004297617288419312, + "loss": 2.4863, + "step": 4871 + }, + { + "epoch": 0.5586515307877536, + "grad_norm": 0.2483914620988069, + "learning_rate": 0.00042957787976872515, + "loss": 2.4015, + "step": 4872 + }, + { + "epoch": 0.5587661965370944, + "grad_norm": 0.24851778591337426, + "learning_rate": 0.0004293940404089084, + "loss": 2.4096, + "step": 4873 + }, + { + "epoch": 0.5588808622864351, + "grad_norm": 0.253936578497908, + "learning_rate": 0.00042921021078783794, + "loss": 2.5402, + "step": 4874 + }, + { + "epoch": 0.5589955280357757, + "grad_norm": 0.2553858357488869, + "learning_rate": 0.00042902639093086954, + "loss": 2.4461, + "step": 4875 + }, + { + "epoch": 0.5591101937851164, + "grad_norm": 0.2372255160378026, + "learning_rate": 0.0004288425808633575, + "loss": 2.4038, + "step": 4876 + }, + { + "epoch": 0.5592248595344571, + "grad_norm": 0.23724388962521775, + "learning_rate": 0.000428658780610655, + "loss": 2.5145, + "step": 4877 + }, + { + "epoch": 0.5593395252837977, + "grad_norm": 0.2505327135657401, + "learning_rate": 0.00042847499019811396, + "loss": 2.3965, + "step": 4878 + }, + { + "epoch": 0.5594541910331384, + "grad_norm": 0.24082436548360425, + "learning_rate": 0.0004282912096510846, + "loss": 2.4938, + "step": 4879 + }, + { + "epoch": 0.559568856782479, + "grad_norm": 0.22894455485668871, + "learning_rate": 0.00042810743899491605, + "loss": 2.4597, + "step": 4880 + }, + { + "epoch": 0.5596835225318197, + "grad_norm": 0.27021313263866303, + "learning_rate": 0.0004279236782549559, + "loss": 2.3288, + "step": 4881 + }, + { + "epoch": 0.5597981882811605, + "grad_norm": 0.22730225284093525, + "learning_rate": 0.0004277399274565502, + "loss": 2.3843, + "step": 4882 + }, + { + "epoch": 0.5599128540305011, + "grad_norm": 0.2649108397364408, + "learning_rate": 0.00042755618662504425, + "loss": 2.5714, + "step": 4883 + }, + { + "epoch": 0.5600275197798418, + "grad_norm": 0.23838060448196088, + "learning_rate": 0.00042737245578578154, + "loss": 2.4044, + "step": 4884 + }, + { + "epoch": 0.5601421855291824, + "grad_norm": 0.2638573793088811, + "learning_rate": 0.00042718873496410407, + "loss": 2.4816, + "step": 4885 + }, + { + "epoch": 0.5602568512785231, + "grad_norm": 0.2648290928472831, + "learning_rate": 0.00042700502418535277, + "loss": 2.4739, + "step": 4886 + }, + { + "epoch": 0.5603715170278638, + "grad_norm": 0.23894492566449418, + "learning_rate": 0.00042682132347486676, + "loss": 2.3284, + "step": 4887 + }, + { + "epoch": 0.5604861827772044, + "grad_norm": 0.2402670088342951, + "learning_rate": 0.00042663763285798444, + "loss": 2.3162, + "step": 4888 + }, + { + "epoch": 0.5606008485265451, + "grad_norm": 0.2652815362614118, + "learning_rate": 0.00042645395236004226, + "loss": 2.6069, + "step": 4889 + }, + { + "epoch": 0.5607155142758858, + "grad_norm": 0.2559776449159992, + "learning_rate": 0.0004262702820063754, + "loss": 2.5234, + "step": 4890 + }, + { + "epoch": 0.5608301800252264, + "grad_norm": 0.2426786195464696, + "learning_rate": 0.0004260866218223177, + "loss": 2.399, + "step": 4891 + }, + { + "epoch": 0.5609448457745672, + "grad_norm": 0.26698254481411127, + "learning_rate": 0.00042590297183320156, + "loss": 2.3718, + "step": 4892 + }, + { + "epoch": 0.5610595115239078, + "grad_norm": 0.26883105127140416, + "learning_rate": 0.00042571933206435813, + "loss": 2.439, + "step": 4893 + }, + { + "epoch": 0.5611741772732485, + "grad_norm": 0.2524544550876167, + "learning_rate": 0.00042553570254111697, + "loss": 2.4446, + "step": 4894 + }, + { + "epoch": 0.5612888430225892, + "grad_norm": 0.25464092400980126, + "learning_rate": 0.0004253520832888061, + "loss": 2.4696, + "step": 4895 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.24338251152511772, + "learning_rate": 0.00042516847433275257, + "loss": 2.395, + "step": 4896 + }, + { + "epoch": 0.5615181745212705, + "grad_norm": 0.2563180763101968, + "learning_rate": 0.00042498487569828135, + "loss": 2.4257, + "step": 4897 + }, + { + "epoch": 0.5616328402706112, + "grad_norm": 0.24530434972226864, + "learning_rate": 0.00042480128741071674, + "loss": 2.3181, + "step": 4898 + }, + { + "epoch": 0.5617475060199518, + "grad_norm": 0.24984142600840645, + "learning_rate": 0.0004246177094953812, + "loss": 2.4263, + "step": 4899 + }, + { + "epoch": 0.5618621717692925, + "grad_norm": 0.29386776057788194, + "learning_rate": 0.00042443414197759565, + "loss": 2.5327, + "step": 4900 + }, + { + "epoch": 0.5619768375186331, + "grad_norm": 0.22677102868342894, + "learning_rate": 0.0004242505848826798, + "loss": 2.4321, + "step": 4901 + }, + { + "epoch": 0.5620915032679739, + "grad_norm": 0.25922261468985014, + "learning_rate": 0.0004240670382359517, + "loss": 2.4383, + "step": 4902 + }, + { + "epoch": 0.5622061690173146, + "grad_norm": 0.2507186314083744, + "learning_rate": 0.0004238835020627285, + "loss": 2.3512, + "step": 4903 + }, + { + "epoch": 0.5623208347666552, + "grad_norm": 0.2476373779137207, + "learning_rate": 0.00042369997638832515, + "loss": 2.3509, + "step": 4904 + }, + { + "epoch": 0.5624355005159959, + "grad_norm": 0.2692043134895928, + "learning_rate": 0.0004235164612380557, + "loss": 2.4558, + "step": 4905 + }, + { + "epoch": 0.5625501662653365, + "grad_norm": 0.24333623662612258, + "learning_rate": 0.0004233329566372326, + "loss": 2.4246, + "step": 4906 + }, + { + "epoch": 0.5626648320146772, + "grad_norm": 0.23950960330645835, + "learning_rate": 0.0004231494626111665, + "loss": 2.4713, + "step": 4907 + }, + { + "epoch": 0.5627794977640179, + "grad_norm": 0.2758028885336742, + "learning_rate": 0.00042296597918516717, + "loss": 2.5374, + "step": 4908 + }, + { + "epoch": 0.5628941635133585, + "grad_norm": 0.27789722969499764, + "learning_rate": 0.0004227825063845427, + "loss": 2.5723, + "step": 4909 + }, + { + "epoch": 0.5630088292626992, + "grad_norm": 0.24353642476783302, + "learning_rate": 0.0004225990442345996, + "loss": 2.3756, + "step": 4910 + }, + { + "epoch": 0.56312349501204, + "grad_norm": 0.2458334667688009, + "learning_rate": 0.00042241559276064297, + "loss": 2.4076, + "step": 4911 + }, + { + "epoch": 0.5632381607613806, + "grad_norm": 0.2729030042695683, + "learning_rate": 0.0004222321519879762, + "loss": 2.5115, + "step": 4912 + }, + { + "epoch": 0.5633528265107213, + "grad_norm": 0.252502709404377, + "learning_rate": 0.00042204872194190194, + "loss": 2.4485, + "step": 4913 + }, + { + "epoch": 0.5634674922600619, + "grad_norm": 0.24861345475897326, + "learning_rate": 0.0004218653026477204, + "loss": 2.5666, + "step": 4914 + }, + { + "epoch": 0.5635821580094026, + "grad_norm": 0.22254850372656043, + "learning_rate": 0.00042168189413073124, + "loss": 2.5101, + "step": 4915 + }, + { + "epoch": 0.5636968237587433, + "grad_norm": 0.24672798872409168, + "learning_rate": 0.00042149849641623166, + "loss": 2.4821, + "step": 4916 + }, + { + "epoch": 0.5638114895080839, + "grad_norm": 0.24014874610772252, + "learning_rate": 0.00042131510952951806, + "loss": 2.4685, + "step": 4917 + }, + { + "epoch": 0.5639261552574246, + "grad_norm": 0.22741422591953592, + "learning_rate": 0.00042113173349588544, + "loss": 2.3497, + "step": 4918 + }, + { + "epoch": 0.5640408210067652, + "grad_norm": 0.2273740470363072, + "learning_rate": 0.00042094836834062686, + "loss": 2.4727, + "step": 4919 + }, + { + "epoch": 0.5641554867561059, + "grad_norm": 0.2387965910549213, + "learning_rate": 0.0004207650140890339, + "loss": 2.5299, + "step": 4920 + }, + { + "epoch": 0.5642701525054467, + "grad_norm": 0.23733339658162125, + "learning_rate": 0.0004205816707663969, + "loss": 2.4013, + "step": 4921 + }, + { + "epoch": 0.5643848182547873, + "grad_norm": 0.26553282989404914, + "learning_rate": 0.00042039833839800444, + "loss": 2.3392, + "step": 4922 + }, + { + "epoch": 0.564499484004128, + "grad_norm": 0.24471383608341513, + "learning_rate": 0.0004202150170091439, + "loss": 2.4644, + "step": 4923 + }, + { + "epoch": 0.5646141497534687, + "grad_norm": 0.23393592612213054, + "learning_rate": 0.0004200317066251011, + "loss": 2.4007, + "step": 4924 + }, + { + "epoch": 0.5647288155028093, + "grad_norm": 0.24215179716579294, + "learning_rate": 0.00041984840727116, + "loss": 2.4905, + "step": 4925 + }, + { + "epoch": 0.56484348125215, + "grad_norm": 0.23905396601960657, + "learning_rate": 0.0004196651189726032, + "loss": 2.4086, + "step": 4926 + }, + { + "epoch": 0.5649581470014906, + "grad_norm": 0.24883208810305038, + "learning_rate": 0.00041948184175471175, + "loss": 2.6536, + "step": 4927 + }, + { + "epoch": 0.5650728127508313, + "grad_norm": 0.2515775101440606, + "learning_rate": 0.00041929857564276574, + "loss": 2.5192, + "step": 4928 + }, + { + "epoch": 0.565187478500172, + "grad_norm": 0.24490707878154433, + "learning_rate": 0.0004191153206620427, + "loss": 2.5293, + "step": 4929 + }, + { + "epoch": 0.5653021442495126, + "grad_norm": 0.22906064671648874, + "learning_rate": 0.00041893207683781953, + "loss": 2.5218, + "step": 4930 + }, + { + "epoch": 0.5654168099988534, + "grad_norm": 0.24325593525300496, + "learning_rate": 0.0004187488441953711, + "loss": 2.3544, + "step": 4931 + }, + { + "epoch": 0.5655314757481941, + "grad_norm": 0.2558655745278075, + "learning_rate": 0.0004185656227599705, + "loss": 2.4883, + "step": 4932 + }, + { + "epoch": 0.5656461414975347, + "grad_norm": 0.3085505469798391, + "learning_rate": 0.00041838241255689015, + "loss": 2.6035, + "step": 4933 + }, + { + "epoch": 0.5657608072468754, + "grad_norm": 0.2453178686534555, + "learning_rate": 0.0004181992136114003, + "loss": 2.459, + "step": 4934 + }, + { + "epoch": 0.565875472996216, + "grad_norm": 0.24832127561749337, + "learning_rate": 0.0004180160259487695, + "loss": 2.4957, + "step": 4935 + }, + { + "epoch": 0.5659901387455567, + "grad_norm": 0.2325191296115635, + "learning_rate": 0.00041783284959426536, + "loss": 2.4064, + "step": 4936 + }, + { + "epoch": 0.5661048044948974, + "grad_norm": 0.23004709154776268, + "learning_rate": 0.00041764968457315313, + "loss": 2.3672, + "step": 4937 + }, + { + "epoch": 0.566219470244238, + "grad_norm": 0.23130771927941238, + "learning_rate": 0.0004174665309106974, + "loss": 2.4628, + "step": 4938 + }, + { + "epoch": 0.5663341359935787, + "grad_norm": 0.2492399193196304, + "learning_rate": 0.00041728338863216037, + "loss": 2.424, + "step": 4939 + }, + { + "epoch": 0.5664488017429193, + "grad_norm": 0.2387036290167383, + "learning_rate": 0.00041710025776280324, + "loss": 2.4757, + "step": 4940 + }, + { + "epoch": 0.56656346749226, + "grad_norm": 0.2579005160141031, + "learning_rate": 0.0004169171383278853, + "loss": 2.4225, + "step": 4941 + }, + { + "epoch": 0.5666781332416008, + "grad_norm": 0.24058982313737814, + "learning_rate": 0.00041673403035266427, + "loss": 2.3818, + "step": 4942 + }, + { + "epoch": 0.5667927989909414, + "grad_norm": 0.2670992961083317, + "learning_rate": 0.00041655093386239677, + "loss": 2.4502, + "step": 4943 + }, + { + "epoch": 0.5669074647402821, + "grad_norm": 0.23070627090023876, + "learning_rate": 0.0004163678488823373, + "loss": 2.5496, + "step": 4944 + }, + { + "epoch": 0.5670221304896228, + "grad_norm": 0.249895098141157, + "learning_rate": 0.00041618477543773876, + "loss": 2.5463, + "step": 4945 + }, + { + "epoch": 0.5671367962389634, + "grad_norm": 0.24158548854674405, + "learning_rate": 0.00041600171355385293, + "loss": 2.483, + "step": 4946 + }, + { + "epoch": 0.5672514619883041, + "grad_norm": 0.23560677576318195, + "learning_rate": 0.0004158186632559293, + "loss": 2.425, + "step": 4947 + }, + { + "epoch": 0.5673661277376447, + "grad_norm": 0.24915892407762452, + "learning_rate": 0.0004156356245692166, + "loss": 2.5013, + "step": 4948 + }, + { + "epoch": 0.5674807934869854, + "grad_norm": 0.23654270998193125, + "learning_rate": 0.00041545259751896136, + "loss": 2.4007, + "step": 4949 + }, + { + "epoch": 0.5675954592363261, + "grad_norm": 0.2220732795267781, + "learning_rate": 0.0004152695821304088, + "loss": 2.4143, + "step": 4950 + }, + { + "epoch": 0.5677101249856668, + "grad_norm": 0.26851873014468003, + "learning_rate": 0.00041508657842880204, + "loss": 2.4724, + "step": 4951 + }, + { + "epoch": 0.5678247907350075, + "grad_norm": 0.2478712374942227, + "learning_rate": 0.0004149035864393832, + "loss": 2.448, + "step": 4952 + }, + { + "epoch": 0.5679394564843481, + "grad_norm": 0.24972375616551537, + "learning_rate": 0.00041472060618739264, + "loss": 2.3692, + "step": 4953 + }, + { + "epoch": 0.5680541222336888, + "grad_norm": 0.2603092447141002, + "learning_rate": 0.00041453763769806886, + "loss": 2.3871, + "step": 4954 + }, + { + "epoch": 0.5681687879830295, + "grad_norm": 0.24006965038814282, + "learning_rate": 0.00041435468099664896, + "loss": 2.3907, + "step": 4955 + }, + { + "epoch": 0.5682834537323701, + "grad_norm": 0.2487257553112925, + "learning_rate": 0.0004141717361083683, + "loss": 2.4635, + "step": 4956 + }, + { + "epoch": 0.5683981194817108, + "grad_norm": 0.2465090302221992, + "learning_rate": 0.00041398880305846034, + "loss": 2.6166, + "step": 4957 + }, + { + "epoch": 0.5685127852310515, + "grad_norm": 0.23274574072710982, + "learning_rate": 0.0004138058818721576, + "loss": 2.5487, + "step": 4958 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 0.28852515261643535, + "learning_rate": 0.00041362297257469053, + "loss": 2.4145, + "step": 4959 + }, + { + "epoch": 0.5687421167297328, + "grad_norm": 0.29588218355642887, + "learning_rate": 0.00041344007519128793, + "loss": 2.5522, + "step": 4960 + }, + { + "epoch": 0.5688567824790735, + "grad_norm": 0.22777146641677498, + "learning_rate": 0.0004132571897471769, + "loss": 2.5213, + "step": 4961 + }, + { + "epoch": 0.5689714482284142, + "grad_norm": 0.27159006231697913, + "learning_rate": 0.00041307431626758296, + "loss": 2.4479, + "step": 4962 + }, + { + "epoch": 0.5690861139777549, + "grad_norm": 0.22502095375227055, + "learning_rate": 0.0004128914547777303, + "loss": 2.3659, + "step": 4963 + }, + { + "epoch": 0.5692007797270955, + "grad_norm": 0.25411900618427435, + "learning_rate": 0.00041270860530284097, + "loss": 2.5514, + "step": 4964 + }, + { + "epoch": 0.5693154454764362, + "grad_norm": 0.2482581321113893, + "learning_rate": 0.0004125257678681357, + "loss": 2.4987, + "step": 4965 + }, + { + "epoch": 0.5694301112257769, + "grad_norm": 0.25103544233672387, + "learning_rate": 0.0004123429424988332, + "loss": 2.3657, + "step": 4966 + }, + { + "epoch": 0.5695447769751175, + "grad_norm": 0.2551137577516324, + "learning_rate": 0.00041216012922015076, + "loss": 2.4499, + "step": 4967 + }, + { + "epoch": 0.5696594427244582, + "grad_norm": 0.240734279180522, + "learning_rate": 0.0004119773280573044, + "loss": 2.5309, + "step": 4968 + }, + { + "epoch": 0.5697741084737988, + "grad_norm": 0.23234678420881874, + "learning_rate": 0.0004117945390355078, + "loss": 2.465, + "step": 4969 + }, + { + "epoch": 0.5698887742231395, + "grad_norm": 0.22087348076078123, + "learning_rate": 0.000411611762179973, + "loss": 2.5495, + "step": 4970 + }, + { + "epoch": 0.5700034399724803, + "grad_norm": 0.2515343611435465, + "learning_rate": 0.0004114289975159109, + "loss": 2.357, + "step": 4971 + }, + { + "epoch": 0.5701181057218209, + "grad_norm": 0.23111997926693914, + "learning_rate": 0.00041124624506853, + "loss": 2.433, + "step": 4972 + }, + { + "epoch": 0.5702327714711616, + "grad_norm": 0.25073422304041754, + "learning_rate": 0.000411063504863038, + "loss": 2.5441, + "step": 4973 + }, + { + "epoch": 0.5703474372205022, + "grad_norm": 0.2369632413734191, + "learning_rate": 0.0004108807769246403, + "loss": 2.5702, + "step": 4974 + }, + { + "epoch": 0.5704621029698429, + "grad_norm": 0.2356751599811318, + "learning_rate": 0.0004106980612785407, + "loss": 2.452, + "step": 4975 + }, + { + "epoch": 0.5705767687191836, + "grad_norm": 0.23679153328467326, + "learning_rate": 0.0004105153579499411, + "loss": 2.517, + "step": 4976 + }, + { + "epoch": 0.5706914344685242, + "grad_norm": 0.24373090189496652, + "learning_rate": 0.0004103326669640421, + "loss": 2.4005, + "step": 4977 + }, + { + "epoch": 0.5708061002178649, + "grad_norm": 0.24078361012545765, + "learning_rate": 0.00041014998834604265, + "loss": 2.5166, + "step": 4978 + }, + { + "epoch": 0.5709207659672056, + "grad_norm": 0.23285832180597507, + "learning_rate": 0.0004099673221211395, + "loss": 2.4188, + "step": 4979 + }, + { + "epoch": 0.5710354317165462, + "grad_norm": 0.24794252343421705, + "learning_rate": 0.00040978466831452824, + "loss": 2.3682, + "step": 4980 + }, + { + "epoch": 0.571150097465887, + "grad_norm": 0.2664849311181591, + "learning_rate": 0.00040960202695140233, + "loss": 2.3678, + "step": 4981 + }, + { + "epoch": 0.5712647632152276, + "grad_norm": 0.25371214019794397, + "learning_rate": 0.0004094193980569534, + "loss": 2.3604, + "step": 4982 + }, + { + "epoch": 0.5713794289645683, + "grad_norm": 0.25233694550672614, + "learning_rate": 0.00040923678165637195, + "loss": 2.3862, + "step": 4983 + }, + { + "epoch": 0.571494094713909, + "grad_norm": 0.23428120722197104, + "learning_rate": 0.00040905417777484655, + "loss": 2.4834, + "step": 4984 + }, + { + "epoch": 0.5716087604632496, + "grad_norm": 0.24161821402865863, + "learning_rate": 0.0004088715864375636, + "loss": 2.3369, + "step": 4985 + }, + { + "epoch": 0.5717234262125903, + "grad_norm": 0.27425058296629806, + "learning_rate": 0.00040868900766970835, + "loss": 2.4576, + "step": 4986 + }, + { + "epoch": 0.5718380919619309, + "grad_norm": 0.2570451729392658, + "learning_rate": 0.0004085064414964638, + "loss": 2.6059, + "step": 4987 + }, + { + "epoch": 0.5719527577112716, + "grad_norm": 0.2617067632594886, + "learning_rate": 0.0004083238879430117, + "loss": 2.5504, + "step": 4988 + }, + { + "epoch": 0.5720674234606123, + "grad_norm": 0.25595958583870637, + "learning_rate": 0.0004081413470345317, + "loss": 2.4119, + "step": 4989 + }, + { + "epoch": 0.572182089209953, + "grad_norm": 0.24897617526276666, + "learning_rate": 0.000407958818796202, + "loss": 2.4577, + "step": 4990 + }, + { + "epoch": 0.5722967549592937, + "grad_norm": 0.2408816396388078, + "learning_rate": 0.0004077763032531987, + "loss": 2.5488, + "step": 4991 + }, + { + "epoch": 0.5724114207086344, + "grad_norm": 0.24116116110685146, + "learning_rate": 0.0004075938004306963, + "loss": 2.4115, + "step": 4992 + }, + { + "epoch": 0.572526086457975, + "grad_norm": 0.22783101352931112, + "learning_rate": 0.0004074113103538679, + "loss": 2.5377, + "step": 4993 + }, + { + "epoch": 0.5726407522073157, + "grad_norm": 0.24684727131855588, + "learning_rate": 0.0004072288330478844, + "loss": 2.4513, + "step": 4994 + }, + { + "epoch": 0.5727554179566563, + "grad_norm": 0.2575865935143395, + "learning_rate": 0.00040704636853791486, + "loss": 2.4231, + "step": 4995 + }, + { + "epoch": 0.572870083705997, + "grad_norm": 0.244373473530503, + "learning_rate": 0.00040686391684912704, + "loss": 2.3674, + "step": 4996 + }, + { + "epoch": 0.5729847494553377, + "grad_norm": 0.2291286251653978, + "learning_rate": 0.0004066814780066863, + "loss": 2.4735, + "step": 4997 + }, + { + "epoch": 0.5730994152046783, + "grad_norm": 0.2510218254496051, + "learning_rate": 0.000406499052035757, + "loss": 2.4592, + "step": 4998 + }, + { + "epoch": 0.573214080954019, + "grad_norm": 0.2618742572643672, + "learning_rate": 0.00040631663896150127, + "loss": 2.4264, + "step": 4999 + }, + { + "epoch": 0.5733287467033598, + "grad_norm": 0.24508574516627288, + "learning_rate": 0.0004061342388090794, + "loss": 2.4508, + "step": 5000 + }, + { + "epoch": 0.5734434124527004, + "grad_norm": 0.26517361701772874, + "learning_rate": 0.00040595185160365, + "loss": 2.4589, + "step": 5001 + }, + { + "epoch": 0.5735580782020411, + "grad_norm": 0.2584045114623395, + "learning_rate": 0.00040576947737036985, + "loss": 2.3148, + "step": 5002 + }, + { + "epoch": 0.5736727439513817, + "grad_norm": 0.26357680107745307, + "learning_rate": 0.00040558711613439426, + "loss": 2.4467, + "step": 5003 + }, + { + "epoch": 0.5737874097007224, + "grad_norm": 0.2815497403049391, + "learning_rate": 0.0004054047679208762, + "loss": 2.6143, + "step": 5004 + }, + { + "epoch": 0.5739020754500631, + "grad_norm": 0.2943102244378706, + "learning_rate": 0.0004052224327549674, + "loss": 2.5064, + "step": 5005 + }, + { + "epoch": 0.5740167411994037, + "grad_norm": 0.2500933055883957, + "learning_rate": 0.0004050401106618174, + "loss": 2.3959, + "step": 5006 + }, + { + "epoch": 0.5741314069487444, + "grad_norm": 0.22715994894760586, + "learning_rate": 0.00040485780166657384, + "loss": 2.3756, + "step": 5007 + }, + { + "epoch": 0.574246072698085, + "grad_norm": 0.250134571121391, + "learning_rate": 0.00040467550579438295, + "loss": 2.4686, + "step": 5008 + }, + { + "epoch": 0.5743607384474257, + "grad_norm": 0.26218010014355303, + "learning_rate": 0.0004044932230703892, + "loss": 2.4689, + "step": 5009 + }, + { + "epoch": 0.5744754041967665, + "grad_norm": 0.27574039824673974, + "learning_rate": 0.0004043109535197347, + "loss": 2.4893, + "step": 5010 + }, + { + "epoch": 0.5745900699461071, + "grad_norm": 0.2738689805186912, + "learning_rate": 0.0004041286971675602, + "loss": 2.3798, + "step": 5011 + }, + { + "epoch": 0.5747047356954478, + "grad_norm": 0.2544767575793516, + "learning_rate": 0.0004039464540390043, + "loss": 2.4219, + "step": 5012 + }, + { + "epoch": 0.5748194014447885, + "grad_norm": 0.2540632078588226, + "learning_rate": 0.00040376422415920425, + "loss": 2.3558, + "step": 5013 + }, + { + "epoch": 0.5749340671941291, + "grad_norm": 0.24574294133316146, + "learning_rate": 0.00040358200755329505, + "loss": 2.5075, + "step": 5014 + }, + { + "epoch": 0.5750487329434698, + "grad_norm": 0.26746431676640986, + "learning_rate": 0.0004033998042464101, + "loss": 2.2936, + "step": 5015 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.26172140562254753, + "learning_rate": 0.00040321761426368065, + "loss": 2.3932, + "step": 5016 + }, + { + "epoch": 0.5752780644421511, + "grad_norm": 0.2439308984364897, + "learning_rate": 0.0004030354376302363, + "loss": 2.318, + "step": 5017 + }, + { + "epoch": 0.5753927301914918, + "grad_norm": 0.2542153649695607, + "learning_rate": 0.00040285327437120533, + "loss": 2.5466, + "step": 5018 + }, + { + "epoch": 0.5755073959408324, + "grad_norm": 0.2728987046727881, + "learning_rate": 0.0004026711245117134, + "loss": 2.5047, + "step": 5019 + }, + { + "epoch": 0.5756220616901732, + "grad_norm": 0.2362835383886875, + "learning_rate": 0.00040248898807688436, + "loss": 2.6127, + "step": 5020 + }, + { + "epoch": 0.5757367274395138, + "grad_norm": 0.23374921880662256, + "learning_rate": 0.0004023068650918409, + "loss": 2.4577, + "step": 5021 + }, + { + "epoch": 0.5758513931888545, + "grad_norm": 0.2478615945335469, + "learning_rate": 0.0004021247555817029, + "loss": 2.3813, + "step": 5022 + }, + { + "epoch": 0.5759660589381952, + "grad_norm": 0.2680961036638813, + "learning_rate": 0.00040194265957158937, + "loss": 2.5342, + "step": 5023 + }, + { + "epoch": 0.5760807246875358, + "grad_norm": 0.2703612385876938, + "learning_rate": 0.0004017605770866169, + "loss": 2.4698, + "step": 5024 + }, + { + "epoch": 0.5761953904368765, + "grad_norm": 0.26580634980699064, + "learning_rate": 0.0004015785081519002, + "loss": 2.4616, + "step": 5025 + }, + { + "epoch": 0.5763100561862172, + "grad_norm": 0.24193585503969292, + "learning_rate": 0.0004013964527925521, + "loss": 2.4514, + "step": 5026 + }, + { + "epoch": 0.5764247219355578, + "grad_norm": 0.2554291481177588, + "learning_rate": 0.0004012144110336837, + "loss": 2.4022, + "step": 5027 + }, + { + "epoch": 0.5765393876848985, + "grad_norm": 0.24575255516281505, + "learning_rate": 0.0004010323829004045, + "loss": 2.4387, + "step": 5028 + }, + { + "epoch": 0.5766540534342391, + "grad_norm": 0.24829422060193634, + "learning_rate": 0.0004008503684178214, + "loss": 2.3711, + "step": 5029 + }, + { + "epoch": 0.5767687191835799, + "grad_norm": 0.22745846231294545, + "learning_rate": 0.0004006683676110402, + "loss": 2.4349, + "step": 5030 + }, + { + "epoch": 0.5768833849329206, + "grad_norm": 0.25414273888483707, + "learning_rate": 0.00040048638050516424, + "loss": 2.3732, + "step": 5031 + }, + { + "epoch": 0.5769980506822612, + "grad_norm": 0.24394908602608312, + "learning_rate": 0.00040030440712529494, + "loss": 2.484, + "step": 5032 + }, + { + "epoch": 0.5771127164316019, + "grad_norm": 0.2522149488774059, + "learning_rate": 0.00040012244749653244, + "loss": 2.5205, + "step": 5033 + }, + { + "epoch": 0.5772273821809426, + "grad_norm": 0.2471588564838673, + "learning_rate": 0.00039994050164397457, + "loss": 2.4099, + "step": 5034 + }, + { + "epoch": 0.5773420479302832, + "grad_norm": 0.24311856279452515, + "learning_rate": 0.00039975856959271707, + "loss": 2.4746, + "step": 5035 + }, + { + "epoch": 0.5774567136796239, + "grad_norm": 0.2830515290716525, + "learning_rate": 0.00039957665136785425, + "loss": 2.4548, + "step": 5036 + }, + { + "epoch": 0.5775713794289645, + "grad_norm": 0.26662539141638475, + "learning_rate": 0.0003993947469944779, + "loss": 2.4929, + "step": 5037 + }, + { + "epoch": 0.5776860451783052, + "grad_norm": 0.2584142809831617, + "learning_rate": 0.0003992128564976787, + "loss": 2.4212, + "step": 5038 + }, + { + "epoch": 0.577800710927646, + "grad_norm": 0.24505748413684836, + "learning_rate": 0.00039903097990254467, + "loss": 2.308, + "step": 5039 + }, + { + "epoch": 0.5779153766769866, + "grad_norm": 0.2738466558146099, + "learning_rate": 0.00039884911723416243, + "loss": 2.4598, + "step": 5040 + }, + { + "epoch": 0.5780300424263273, + "grad_norm": 0.26009202116066776, + "learning_rate": 0.00039866726851761625, + "loss": 2.4846, + "step": 5041 + }, + { + "epoch": 0.5781447081756679, + "grad_norm": 0.25752906127190345, + "learning_rate": 0.0003984854337779887, + "loss": 2.5153, + "step": 5042 + }, + { + "epoch": 0.5782593739250086, + "grad_norm": 0.23544412902635597, + "learning_rate": 0.00039830361304036074, + "loss": 2.4548, + "step": 5043 + }, + { + "epoch": 0.5783740396743493, + "grad_norm": 0.24210025081083186, + "learning_rate": 0.00039812180632981084, + "loss": 2.4365, + "step": 5044 + }, + { + "epoch": 0.5784887054236899, + "grad_norm": 0.24423491119591487, + "learning_rate": 0.0003979400136714156, + "loss": 2.3371, + "step": 5045 + }, + { + "epoch": 0.5786033711730306, + "grad_norm": 0.2565386641593241, + "learning_rate": 0.00039775823509025023, + "loss": 2.3662, + "step": 5046 + }, + { + "epoch": 0.5787180369223713, + "grad_norm": 0.2456841947874958, + "learning_rate": 0.00039757647061138717, + "loss": 2.4482, + "step": 5047 + }, + { + "epoch": 0.5788327026717119, + "grad_norm": 0.25895394305836317, + "learning_rate": 0.00039739472025989776, + "loss": 2.4652, + "step": 5048 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.2508965260081382, + "learning_rate": 0.00039721298406085096, + "loss": 2.4232, + "step": 5049 + }, + { + "epoch": 0.5790620341703933, + "grad_norm": 0.24100756074686788, + "learning_rate": 0.0003970312620393137, + "loss": 2.3737, + "step": 5050 + }, + { + "epoch": 0.579176699919734, + "grad_norm": 0.25636627096187176, + "learning_rate": 0.00039684955422035094, + "loss": 2.3714, + "step": 5051 + }, + { + "epoch": 0.5792913656690747, + "grad_norm": 0.24493546915612638, + "learning_rate": 0.00039666786062902585, + "loss": 2.536, + "step": 5052 + }, + { + "epoch": 0.5794060314184153, + "grad_norm": 0.24890900174574493, + "learning_rate": 0.0003964861812904, + "loss": 2.3702, + "step": 5053 + }, + { + "epoch": 0.579520697167756, + "grad_norm": 0.2465741681949579, + "learning_rate": 0.00039630451622953204, + "loss": 2.5329, + "step": 5054 + }, + { + "epoch": 0.5796353629170966, + "grad_norm": 0.23526520178097005, + "learning_rate": 0.0003961228654714797, + "loss": 2.3701, + "step": 5055 + }, + { + "epoch": 0.5797500286664373, + "grad_norm": 0.23174315551798486, + "learning_rate": 0.00039594122904129794, + "loss": 2.3131, + "step": 5056 + }, + { + "epoch": 0.579864694415778, + "grad_norm": 0.23478348321827575, + "learning_rate": 0.00039575960696403977, + "loss": 2.4803, + "step": 5057 + }, + { + "epoch": 0.5799793601651186, + "grad_norm": 0.2673030411901626, + "learning_rate": 0.000395577999264757, + "loss": 2.4607, + "step": 5058 + }, + { + "epoch": 0.5800940259144594, + "grad_norm": 0.24203746326427245, + "learning_rate": 0.0003953964059684989, + "loss": 2.5467, + "step": 5059 + }, + { + "epoch": 0.5802086916638001, + "grad_norm": 0.2704064739299854, + "learning_rate": 0.00039521482710031257, + "loss": 2.3754, + "step": 5060 + }, + { + "epoch": 0.5803233574131407, + "grad_norm": 0.2599606850637984, + "learning_rate": 0.00039503326268524355, + "loss": 2.4309, + "step": 5061 + }, + { + "epoch": 0.5804380231624814, + "grad_norm": 0.2523235674404862, + "learning_rate": 0.00039485171274833484, + "loss": 2.4223, + "step": 5062 + }, + { + "epoch": 0.580552688911822, + "grad_norm": 0.252881143135449, + "learning_rate": 0.0003946701773146283, + "loss": 2.4115, + "step": 5063 + }, + { + "epoch": 0.5806673546611627, + "grad_norm": 0.2361170998519746, + "learning_rate": 0.00039448865640916294, + "loss": 2.4754, + "step": 5064 + }, + { + "epoch": 0.5807820204105034, + "grad_norm": 0.2723019477140788, + "learning_rate": 0.0003943071500569763, + "loss": 2.4067, + "step": 5065 + }, + { + "epoch": 0.580896686159844, + "grad_norm": 0.2677907972770214, + "learning_rate": 0.0003941256582831035, + "loss": 2.4862, + "step": 5066 + }, + { + "epoch": 0.5810113519091847, + "grad_norm": 0.23811825442679022, + "learning_rate": 0.00039394418111257786, + "loss": 2.5868, + "step": 5067 + }, + { + "epoch": 0.5811260176585255, + "grad_norm": 0.23395874909472864, + "learning_rate": 0.00039376271857043105, + "loss": 2.472, + "step": 5068 + }, + { + "epoch": 0.581240683407866, + "grad_norm": 0.27622535593774344, + "learning_rate": 0.0003935812706816921, + "loss": 2.489, + "step": 5069 + }, + { + "epoch": 0.5813553491572068, + "grad_norm": 0.23427250282300477, + "learning_rate": 0.00039339983747138836, + "loss": 2.2698, + "step": 5070 + }, + { + "epoch": 0.5814700149065474, + "grad_norm": 0.26246258835864084, + "learning_rate": 0.00039321841896454505, + "loss": 2.5501, + "step": 5071 + }, + { + "epoch": 0.5815846806558881, + "grad_norm": 0.2355133164645691, + "learning_rate": 0.00039303701518618526, + "loss": 2.5025, + "step": 5072 + }, + { + "epoch": 0.5816993464052288, + "grad_norm": 0.22468195801257737, + "learning_rate": 0.0003928556261613304, + "loss": 2.3843, + "step": 5073 + }, + { + "epoch": 0.5818140121545694, + "grad_norm": 0.25635137594889784, + "learning_rate": 0.0003926742519149997, + "loss": 2.477, + "step": 5074 + }, + { + "epoch": 0.5819286779039101, + "grad_norm": 0.25737719771148077, + "learning_rate": 0.00039249289247221005, + "loss": 2.512, + "step": 5075 + }, + { + "epoch": 0.5820433436532507, + "grad_norm": 0.20689402098413232, + "learning_rate": 0.0003923115478579765, + "loss": 2.3257, + "step": 5076 + }, + { + "epoch": 0.5821580094025914, + "grad_norm": 0.23565201934851593, + "learning_rate": 0.0003921302180973122, + "loss": 2.4065, + "step": 5077 + }, + { + "epoch": 0.5822726751519322, + "grad_norm": 0.25220776282812907, + "learning_rate": 0.00039194890321522823, + "loss": 2.4514, + "step": 5078 + }, + { + "epoch": 0.5823873409012728, + "grad_norm": 0.24143421815285085, + "learning_rate": 0.00039176760323673336, + "loss": 2.3765, + "step": 5079 + }, + { + "epoch": 0.5825020066506135, + "grad_norm": 0.2636850660172274, + "learning_rate": 0.00039158631818683466, + "loss": 2.5025, + "step": 5080 + }, + { + "epoch": 0.5826166723999542, + "grad_norm": 0.2665380235270446, + "learning_rate": 0.0003914050480905368, + "loss": 2.5385, + "step": 5081 + }, + { + "epoch": 0.5827313381492948, + "grad_norm": 0.24059664363062083, + "learning_rate": 0.0003912237929728423, + "loss": 2.5053, + "step": 5082 + }, + { + "epoch": 0.5828460038986355, + "grad_norm": 0.25706087748751005, + "learning_rate": 0.00039104255285875224, + "loss": 2.3346, + "step": 5083 + }, + { + "epoch": 0.5829606696479761, + "grad_norm": 0.26485229627628987, + "learning_rate": 0.0003908613277732652, + "loss": 2.5231, + "step": 5084 + }, + { + "epoch": 0.5830753353973168, + "grad_norm": 0.22552195086138882, + "learning_rate": 0.0003906801177413775, + "loss": 2.4214, + "step": 5085 + }, + { + "epoch": 0.5831900011466575, + "grad_norm": 0.258545708869077, + "learning_rate": 0.0003904989227880838, + "loss": 2.4413, + "step": 5086 + }, + { + "epoch": 0.5833046668959981, + "grad_norm": 0.25195722959452027, + "learning_rate": 0.00039031774293837623, + "loss": 2.5531, + "step": 5087 + }, + { + "epoch": 0.5834193326453389, + "grad_norm": 0.2442888082596338, + "learning_rate": 0.0003901365782172456, + "loss": 2.5232, + "step": 5088 + }, + { + "epoch": 0.5835339983946795, + "grad_norm": 0.22462173176703037, + "learning_rate": 0.0003899554286496796, + "loss": 2.4378, + "step": 5089 + }, + { + "epoch": 0.5836486641440202, + "grad_norm": 0.24443331795574558, + "learning_rate": 0.0003897742942606647, + "loss": 2.4465, + "step": 5090 + }, + { + "epoch": 0.5837633298933609, + "grad_norm": 0.2617752188904536, + "learning_rate": 0.0003895931750751847, + "loss": 2.5525, + "step": 5091 + }, + { + "epoch": 0.5838779956427015, + "grad_norm": 0.2357586763821419, + "learning_rate": 0.00038941207111822156, + "loss": 2.4645, + "step": 5092 + }, + { + "epoch": 0.5839926613920422, + "grad_norm": 0.23462237796448132, + "learning_rate": 0.0003892309824147554, + "loss": 2.3949, + "step": 5093 + }, + { + "epoch": 0.5841073271413829, + "grad_norm": 0.22403263868852266, + "learning_rate": 0.0003890499089897638, + "loss": 2.4275, + "step": 5094 + }, + { + "epoch": 0.5842219928907235, + "grad_norm": 0.2262607106441164, + "learning_rate": 0.0003888688508682223, + "loss": 2.484, + "step": 5095 + }, + { + "epoch": 0.5843366586400642, + "grad_norm": 0.22706550788083926, + "learning_rate": 0.0003886878080751045, + "loss": 2.3659, + "step": 5096 + }, + { + "epoch": 0.5844513243894048, + "grad_norm": 0.2904966655390043, + "learning_rate": 0.0003885067806353816, + "loss": 2.4453, + "step": 5097 + }, + { + "epoch": 0.5845659901387456, + "grad_norm": 0.233395204038724, + "learning_rate": 0.00038832576857402313, + "loss": 2.3913, + "step": 5098 + }, + { + "epoch": 0.5846806558880863, + "grad_norm": 0.27260707595832806, + "learning_rate": 0.0003881447719159963, + "loss": 2.4847, + "step": 5099 + }, + { + "epoch": 0.5847953216374269, + "grad_norm": 0.3157668319774191, + "learning_rate": 0.00038796379068626604, + "loss": 2.6039, + "step": 5100 + }, + { + "epoch": 0.5849099873867676, + "grad_norm": 0.2601864630542841, + "learning_rate": 0.00038778282490979513, + "loss": 2.3642, + "step": 5101 + }, + { + "epoch": 0.5850246531361083, + "grad_norm": 0.25849911496167405, + "learning_rate": 0.0003876018746115443, + "loss": 2.5578, + "step": 5102 + }, + { + "epoch": 0.5851393188854489, + "grad_norm": 0.23673207521087006, + "learning_rate": 0.0003874209398164727, + "loss": 2.5212, + "step": 5103 + }, + { + "epoch": 0.5852539846347896, + "grad_norm": 0.24963372403303236, + "learning_rate": 0.0003872400205495363, + "loss": 2.4637, + "step": 5104 + }, + { + "epoch": 0.5853686503841302, + "grad_norm": 0.25841570086865334, + "learning_rate": 0.0003870591168356898, + "loss": 2.4079, + "step": 5105 + }, + { + "epoch": 0.5854833161334709, + "grad_norm": 0.22831346063674995, + "learning_rate": 0.0003868782286998853, + "loss": 2.3969, + "step": 5106 + }, + { + "epoch": 0.5855979818828116, + "grad_norm": 0.2539261123956983, + "learning_rate": 0.0003866973561670727, + "loss": 2.3606, + "step": 5107 + }, + { + "epoch": 0.5857126476321523, + "grad_norm": 0.2543712059027437, + "learning_rate": 0.00038651649926220005, + "loss": 2.4078, + "step": 5108 + }, + { + "epoch": 0.585827313381493, + "grad_norm": 0.2582872878528295, + "learning_rate": 0.0003863356580102133, + "loss": 2.6518, + "step": 5109 + }, + { + "epoch": 0.5859419791308336, + "grad_norm": 0.24359303332375276, + "learning_rate": 0.00038615483243605584, + "loss": 2.4019, + "step": 5110 + }, + { + "epoch": 0.5860566448801743, + "grad_norm": 0.2389561542487212, + "learning_rate": 0.0003859740225646692, + "loss": 2.639, + "step": 5111 + }, + { + "epoch": 0.586171310629515, + "grad_norm": 0.23655965261489872, + "learning_rate": 0.0003857932284209924, + "loss": 2.593, + "step": 5112 + }, + { + "epoch": 0.5862859763788556, + "grad_norm": 0.2389807607585272, + "learning_rate": 0.00038561245002996297, + "loss": 2.5735, + "step": 5113 + }, + { + "epoch": 0.5864006421281963, + "grad_norm": 0.24123560674734348, + "learning_rate": 0.00038543168741651557, + "loss": 2.4884, + "step": 5114 + }, + { + "epoch": 0.586515307877537, + "grad_norm": 0.2570912547735875, + "learning_rate": 0.00038525094060558307, + "loss": 2.4702, + "step": 5115 + }, + { + "epoch": 0.5866299736268776, + "grad_norm": 0.21642849264555775, + "learning_rate": 0.00038507020962209594, + "loss": 2.4302, + "step": 5116 + }, + { + "epoch": 0.5867446393762183, + "grad_norm": 0.25150177391554046, + "learning_rate": 0.00038488949449098247, + "loss": 2.5713, + "step": 5117 + }, + { + "epoch": 0.586859305125559, + "grad_norm": 0.22825456955090323, + "learning_rate": 0.0003847087952371693, + "loss": 2.4753, + "step": 5118 + }, + { + "epoch": 0.5869739708748997, + "grad_norm": 0.2324367981284907, + "learning_rate": 0.00038452811188558015, + "loss": 2.3943, + "step": 5119 + }, + { + "epoch": 0.5870886366242404, + "grad_norm": 0.23053754454441402, + "learning_rate": 0.0003843474444611368, + "loss": 2.5536, + "step": 5120 + }, + { + "epoch": 0.587203302373581, + "grad_norm": 0.24336870442232017, + "learning_rate": 0.000384166792988759, + "loss": 2.4241, + "step": 5121 + }, + { + "epoch": 0.5873179681229217, + "grad_norm": 0.26445927468055774, + "learning_rate": 0.0003839861574933639, + "loss": 2.4166, + "step": 5122 + }, + { + "epoch": 0.5874326338722623, + "grad_norm": 0.2375854637209677, + "learning_rate": 0.0003838055379998671, + "loss": 2.4852, + "step": 5123 + }, + { + "epoch": 0.587547299621603, + "grad_norm": 0.21839824598756138, + "learning_rate": 0.0003836249345331816, + "loss": 2.3611, + "step": 5124 + }, + { + "epoch": 0.5876619653709437, + "grad_norm": 0.24070482123619102, + "learning_rate": 0.000383444347118218, + "loss": 2.3565, + "step": 5125 + }, + { + "epoch": 0.5877766311202843, + "grad_norm": 0.25694149981460424, + "learning_rate": 0.0003832637757798849, + "loss": 2.3863, + "step": 5126 + }, + { + "epoch": 0.587891296869625, + "grad_norm": 0.2548723885866467, + "learning_rate": 0.00038308322054308867, + "loss": 2.4829, + "step": 5127 + }, + { + "epoch": 0.5880059626189658, + "grad_norm": 0.25401275538990925, + "learning_rate": 0.00038290268143273366, + "loss": 2.3436, + "step": 5128 + }, + { + "epoch": 0.5881206283683064, + "grad_norm": 0.2593290721715852, + "learning_rate": 0.00038272215847372156, + "loss": 2.3948, + "step": 5129 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.24015610468909754, + "learning_rate": 0.00038254165169095236, + "loss": 2.375, + "step": 5130 + }, + { + "epoch": 0.5883499598669877, + "grad_norm": 0.2784877153979556, + "learning_rate": 0.00038236116110932326, + "loss": 2.5675, + "step": 5131 + }, + { + "epoch": 0.5884646256163284, + "grad_norm": 0.26807598371237545, + "learning_rate": 0.00038218068675372944, + "loss": 2.6188, + "step": 5132 + }, + { + "epoch": 0.5885792913656691, + "grad_norm": 0.25456400319575573, + "learning_rate": 0.00038200022864906416, + "loss": 2.4575, + "step": 5133 + }, + { + "epoch": 0.5886939571150097, + "grad_norm": 0.2633685375489386, + "learning_rate": 0.00038181978682021807, + "loss": 2.5152, + "step": 5134 + }, + { + "epoch": 0.5888086228643504, + "grad_norm": 0.2279632026284544, + "learning_rate": 0.00038163936129207966, + "loss": 2.3716, + "step": 5135 + }, + { + "epoch": 0.5889232886136911, + "grad_norm": 0.22647178921101202, + "learning_rate": 0.00038145895208953524, + "loss": 2.3648, + "step": 5136 + }, + { + "epoch": 0.5890379543630317, + "grad_norm": 0.28524370005834754, + "learning_rate": 0.0003812785592374686, + "loss": 2.3491, + "step": 5137 + }, + { + "epoch": 0.5891526201123725, + "grad_norm": 0.23828153082710252, + "learning_rate": 0.0003810981827607619, + "loss": 2.3706, + "step": 5138 + }, + { + "epoch": 0.5892672858617131, + "grad_norm": 0.22684081710844117, + "learning_rate": 0.0003809178226842943, + "loss": 2.3649, + "step": 5139 + }, + { + "epoch": 0.5893819516110538, + "grad_norm": 0.2480258561568927, + "learning_rate": 0.0003807374790329432, + "loss": 2.3319, + "step": 5140 + }, + { + "epoch": 0.5894966173603945, + "grad_norm": 0.2538901759850134, + "learning_rate": 0.0003805571518315835, + "loss": 2.4634, + "step": 5141 + }, + { + "epoch": 0.5896112831097351, + "grad_norm": 0.2620400671794665, + "learning_rate": 0.00038037684110508774, + "loss": 2.5996, + "step": 5142 + }, + { + "epoch": 0.5897259488590758, + "grad_norm": 0.25339051014132397, + "learning_rate": 0.00038019654687832677, + "loss": 2.4585, + "step": 5143 + }, + { + "epoch": 0.5898406146084164, + "grad_norm": 0.24830535467373968, + "learning_rate": 0.0003800162691761685, + "loss": 2.3516, + "step": 5144 + }, + { + "epoch": 0.5899552803577571, + "grad_norm": 0.23001181752788724, + "learning_rate": 0.00037983600802347873, + "loss": 2.3587, + "step": 5145 + }, + { + "epoch": 0.5900699461070978, + "grad_norm": 0.26866400418278735, + "learning_rate": 0.00037965576344512115, + "loss": 2.4302, + "step": 5146 + }, + { + "epoch": 0.5901846118564384, + "grad_norm": 0.253672309672238, + "learning_rate": 0.0003794755354659569, + "loss": 2.4196, + "step": 5147 + }, + { + "epoch": 0.5902992776057792, + "grad_norm": 0.2936446606417755, + "learning_rate": 0.0003792953241108452, + "loss": 2.4583, + "step": 5148 + }, + { + "epoch": 0.5904139433551199, + "grad_norm": 0.259918351178376, + "learning_rate": 0.0003791151294046428, + "loss": 2.5675, + "step": 5149 + }, + { + "epoch": 0.5905286091044605, + "grad_norm": 0.2492783641204892, + "learning_rate": 0.000378934951372204, + "loss": 2.2789, + "step": 5150 + }, + { + "epoch": 0.5906432748538012, + "grad_norm": 0.230375421930286, + "learning_rate": 0.0003787547900383808, + "loss": 2.5946, + "step": 5151 + }, + { + "epoch": 0.5907579406031418, + "grad_norm": 0.23813157043930716, + "learning_rate": 0.0003785746454280231, + "loss": 2.5842, + "step": 5152 + }, + { + "epoch": 0.5908726063524825, + "grad_norm": 0.28499346791973534, + "learning_rate": 0.00037839451756597873, + "loss": 2.4785, + "step": 5153 + }, + { + "epoch": 0.5909872721018232, + "grad_norm": 0.23008861987223928, + "learning_rate": 0.0003782144064770925, + "loss": 2.33, + "step": 5154 + }, + { + "epoch": 0.5911019378511638, + "grad_norm": 0.24500450507557636, + "learning_rate": 0.0003780343121862077, + "loss": 2.486, + "step": 5155 + }, + { + "epoch": 0.5912166036005045, + "grad_norm": 0.22027311168219074, + "learning_rate": 0.0003778542347181645, + "loss": 2.2973, + "step": 5156 + }, + { + "epoch": 0.5913312693498453, + "grad_norm": 0.23704132328570804, + "learning_rate": 0.00037767417409780116, + "loss": 2.3339, + "step": 5157 + }, + { + "epoch": 0.5914459350991859, + "grad_norm": 0.2656755154419864, + "learning_rate": 0.00037749413034995384, + "loss": 2.3659, + "step": 5158 + }, + { + "epoch": 0.5915606008485266, + "grad_norm": 0.23435159851520573, + "learning_rate": 0.00037731410349945614, + "loss": 2.3299, + "step": 5159 + }, + { + "epoch": 0.5916752665978672, + "grad_norm": 0.24382278785597827, + "learning_rate": 0.0003771340935711392, + "loss": 2.4224, + "step": 5160 + }, + { + "epoch": 0.5917899323472079, + "grad_norm": 0.23004772697730544, + "learning_rate": 0.00037695410058983213, + "loss": 2.4473, + "step": 5161 + }, + { + "epoch": 0.5919045980965486, + "grad_norm": 0.2384115129277421, + "learning_rate": 0.0003767741245803612, + "loss": 2.4481, + "step": 5162 + }, + { + "epoch": 0.5920192638458892, + "grad_norm": 0.2668427586192323, + "learning_rate": 0.0003765941655675511, + "loss": 2.4544, + "step": 5163 + }, + { + "epoch": 0.5921339295952299, + "grad_norm": 0.2948163050877117, + "learning_rate": 0.0003764142235762234, + "loss": 2.5963, + "step": 5164 + }, + { + "epoch": 0.5922485953445705, + "grad_norm": 0.251970855727244, + "learning_rate": 0.0003762342986311979, + "loss": 2.369, + "step": 5165 + }, + { + "epoch": 0.5923632610939112, + "grad_norm": 0.2540598628161775, + "learning_rate": 0.00037605439075729166, + "loss": 2.4363, + "step": 5166 + }, + { + "epoch": 0.592477926843252, + "grad_norm": 0.26671291686672455, + "learning_rate": 0.00037587449997931945, + "loss": 2.386, + "step": 5167 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.23963713349476787, + "learning_rate": 0.0003756946263220941, + "loss": 2.611, + "step": 5168 + }, + { + "epoch": 0.5927072583419333, + "grad_norm": 0.2567121271517095, + "learning_rate": 0.0003755147698104256, + "loss": 2.4759, + "step": 5169 + }, + { + "epoch": 0.592821924091274, + "grad_norm": 0.25353372198431134, + "learning_rate": 0.0003753349304691216, + "loss": 2.3632, + "step": 5170 + }, + { + "epoch": 0.5929365898406146, + "grad_norm": 0.2600916161516471, + "learning_rate": 0.00037515510832298774, + "loss": 2.5963, + "step": 5171 + }, + { + "epoch": 0.5930512555899553, + "grad_norm": 0.25872364979107476, + "learning_rate": 0.0003749753033968267, + "loss": 2.4614, + "step": 5172 + }, + { + "epoch": 0.5931659213392959, + "grad_norm": 0.2529198813004184, + "learning_rate": 0.00037479551571543946, + "loss": 2.3794, + "step": 5173 + }, + { + "epoch": 0.5932805870886366, + "grad_norm": 0.2572923042974584, + "learning_rate": 0.0003746157453036243, + "loss": 2.2531, + "step": 5174 + }, + { + "epoch": 0.5933952528379773, + "grad_norm": 0.22537207848971327, + "learning_rate": 0.0003744359921861771, + "loss": 2.4603, + "step": 5175 + }, + { + "epoch": 0.5935099185873179, + "grad_norm": 0.2646339168110124, + "learning_rate": 0.0003742562563878911, + "loss": 2.3739, + "step": 5176 + }, + { + "epoch": 0.5936245843366587, + "grad_norm": 0.249340296652455, + "learning_rate": 0.00037407653793355766, + "loss": 2.5506, + "step": 5177 + }, + { + "epoch": 0.5937392500859993, + "grad_norm": 0.24853961395488436, + "learning_rate": 0.0003738968368479656, + "loss": 2.5452, + "step": 5178 + }, + { + "epoch": 0.59385391583534, + "grad_norm": 0.23898825980332986, + "learning_rate": 0.00037371715315590113, + "loss": 2.3822, + "step": 5179 + }, + { + "epoch": 0.5939685815846807, + "grad_norm": 0.23979273772098084, + "learning_rate": 0.00037353748688214827, + "loss": 2.3917, + "step": 5180 + }, + { + "epoch": 0.5940832473340213, + "grad_norm": 0.23304283742061266, + "learning_rate": 0.0003733578380514885, + "loss": 2.4465, + "step": 5181 + }, + { + "epoch": 0.594197913083362, + "grad_norm": 0.244749265371565, + "learning_rate": 0.0003731782066887007, + "loss": 2.3779, + "step": 5182 + }, + { + "epoch": 0.5943125788327027, + "grad_norm": 0.24056692933959664, + "learning_rate": 0.000372998592818562, + "loss": 2.4192, + "step": 5183 + }, + { + "epoch": 0.5944272445820433, + "grad_norm": 0.22937043339566887, + "learning_rate": 0.00037281899646584665, + "loss": 2.3419, + "step": 5184 + }, + { + "epoch": 0.594541910331384, + "grad_norm": 0.2612073378271702, + "learning_rate": 0.00037263941765532636, + "loss": 2.3747, + "step": 5185 + }, + { + "epoch": 0.5946565760807246, + "grad_norm": 0.2317889381707554, + "learning_rate": 0.0003724598564117708, + "loss": 2.3791, + "step": 5186 + }, + { + "epoch": 0.5947712418300654, + "grad_norm": 0.23536942526013294, + "learning_rate": 0.0003722803127599467, + "loss": 2.4637, + "step": 5187 + }, + { + "epoch": 0.5948859075794061, + "grad_norm": 0.23659790097348804, + "learning_rate": 0.00037210078672461904, + "loss": 2.3089, + "step": 5188 + }, + { + "epoch": 0.5950005733287467, + "grad_norm": 0.2656512664794035, + "learning_rate": 0.00037192127833054984, + "loss": 2.4447, + "step": 5189 + }, + { + "epoch": 0.5951152390780874, + "grad_norm": 0.24772561533391785, + "learning_rate": 0.000371741787602499, + "loss": 2.4652, + "step": 5190 + }, + { + "epoch": 0.5952299048274281, + "grad_norm": 0.2704458006417386, + "learning_rate": 0.00037156231456522356, + "loss": 2.5326, + "step": 5191 + }, + { + "epoch": 0.5953445705767687, + "grad_norm": 0.23577741711198177, + "learning_rate": 0.00037138285924347847, + "loss": 2.4445, + "step": 5192 + }, + { + "epoch": 0.5954592363261094, + "grad_norm": 0.25107323311785995, + "learning_rate": 0.00037120342166201657, + "loss": 2.3266, + "step": 5193 + }, + { + "epoch": 0.59557390207545, + "grad_norm": 0.2639120249856345, + "learning_rate": 0.00037102400184558756, + "loss": 2.5847, + "step": 5194 + }, + { + "epoch": 0.5956885678247907, + "grad_norm": 0.23016154688523172, + "learning_rate": 0.0003708445998189388, + "loss": 2.2762, + "step": 5195 + }, + { + "epoch": 0.5958032335741315, + "grad_norm": 0.21968766876680054, + "learning_rate": 0.00037066521560681577, + "loss": 2.4486, + "step": 5196 + }, + { + "epoch": 0.5959178993234721, + "grad_norm": 0.2474525986223619, + "learning_rate": 0.00037048584923396065, + "loss": 2.4081, + "step": 5197 + }, + { + "epoch": 0.5960325650728128, + "grad_norm": 0.26720726176898263, + "learning_rate": 0.000370306500725114, + "loss": 2.4163, + "step": 5198 + }, + { + "epoch": 0.5961472308221534, + "grad_norm": 0.25818794110757337, + "learning_rate": 0.00037012717010501345, + "loss": 2.5431, + "step": 5199 + }, + { + "epoch": 0.5962618965714941, + "grad_norm": 0.262054617895628, + "learning_rate": 0.0003699478573983942, + "loss": 2.3955, + "step": 5200 + }, + { + "epoch": 0.5963765623208348, + "grad_norm": 0.253990997349075, + "learning_rate": 0.0003697685626299889, + "loss": 2.371, + "step": 5201 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 0.2414532342476216, + "learning_rate": 0.00036958928582452787, + "loss": 2.5223, + "step": 5202 + }, + { + "epoch": 0.5966058938195161, + "grad_norm": 0.2664248820803913, + "learning_rate": 0.0003694100270067392, + "loss": 2.5165, + "step": 5203 + }, + { + "epoch": 0.5967205595688568, + "grad_norm": 0.26658400291592627, + "learning_rate": 0.00036923078620134785, + "loss": 2.4906, + "step": 5204 + }, + { + "epoch": 0.5968352253181974, + "grad_norm": 0.23903412500930632, + "learning_rate": 0.0003690515634330771, + "loss": 2.3778, + "step": 5205 + }, + { + "epoch": 0.5969498910675382, + "grad_norm": 0.23694566185799668, + "learning_rate": 0.000368872358726647, + "loss": 2.5241, + "step": 5206 + }, + { + "epoch": 0.5970645568168788, + "grad_norm": 0.27354646591190823, + "learning_rate": 0.00036869317210677524, + "loss": 2.4304, + "step": 5207 + }, + { + "epoch": 0.5971792225662195, + "grad_norm": 0.277008459080636, + "learning_rate": 0.0003685140035981776, + "loss": 2.3558, + "step": 5208 + }, + { + "epoch": 0.5972938883155602, + "grad_norm": 0.24723880611501, + "learning_rate": 0.0003683348532255669, + "loss": 2.4497, + "step": 5209 + }, + { + "epoch": 0.5974085540649008, + "grad_norm": 0.24899880573157604, + "learning_rate": 0.00036815572101365335, + "loss": 2.4871, + "step": 5210 + }, + { + "epoch": 0.5975232198142415, + "grad_norm": 0.2524386294370823, + "learning_rate": 0.000367976606987145, + "loss": 2.3075, + "step": 5211 + }, + { + "epoch": 0.5976378855635821, + "grad_norm": 0.23611411339056015, + "learning_rate": 0.00036779751117074695, + "loss": 2.4105, + "step": 5212 + }, + { + "epoch": 0.5977525513129228, + "grad_norm": 0.24400823344718622, + "learning_rate": 0.00036761843358916235, + "loss": 2.5683, + "step": 5213 + }, + { + "epoch": 0.5978672170622635, + "grad_norm": 0.24481773572436466, + "learning_rate": 0.0003674393742670914, + "loss": 2.49, + "step": 5214 + }, + { + "epoch": 0.5979818828116041, + "grad_norm": 0.2655993211569076, + "learning_rate": 0.000367260333229232, + "loss": 2.3868, + "step": 5215 + }, + { + "epoch": 0.5980965485609449, + "grad_norm": 0.25755946919280404, + "learning_rate": 0.00036708131050027925, + "loss": 2.522, + "step": 5216 + }, + { + "epoch": 0.5982112143102856, + "grad_norm": 0.25282300496118976, + "learning_rate": 0.000366902306104926, + "loss": 2.4832, + "step": 5217 + }, + { + "epoch": 0.5983258800596262, + "grad_norm": 0.26455383199806126, + "learning_rate": 0.00036672332006786274, + "loss": 2.3676, + "step": 5218 + }, + { + "epoch": 0.5984405458089669, + "grad_norm": 0.24852381927720799, + "learning_rate": 0.00036654435241377704, + "loss": 2.4214, + "step": 5219 + }, + { + "epoch": 0.5985552115583075, + "grad_norm": 0.25550939683738993, + "learning_rate": 0.00036636540316735384, + "loss": 2.3789, + "step": 5220 + }, + { + "epoch": 0.5986698773076482, + "grad_norm": 0.23369424677905018, + "learning_rate": 0.0003661864723532761, + "loss": 2.4665, + "step": 5221 + }, + { + "epoch": 0.5987845430569889, + "grad_norm": 0.2511536869835362, + "learning_rate": 0.0003660075599962236, + "loss": 2.2889, + "step": 5222 + }, + { + "epoch": 0.5988992088063295, + "grad_norm": 0.2546526074279082, + "learning_rate": 0.0003658286661208741, + "loss": 2.5958, + "step": 5223 + }, + { + "epoch": 0.5990138745556702, + "grad_norm": 0.2545174935018608, + "learning_rate": 0.00036564979075190277, + "loss": 2.4371, + "step": 5224 + }, + { + "epoch": 0.599128540305011, + "grad_norm": 0.24902735732294498, + "learning_rate": 0.0003654709339139818, + "loss": 2.5822, + "step": 5225 + }, + { + "epoch": 0.5992432060543516, + "grad_norm": 0.2560034123376561, + "learning_rate": 0.000365292095631781, + "loss": 2.5191, + "step": 5226 + }, + { + "epoch": 0.5993578718036923, + "grad_norm": 0.2701721320301501, + "learning_rate": 0.0003651132759299678, + "loss": 2.4605, + "step": 5227 + }, + { + "epoch": 0.5994725375530329, + "grad_norm": 0.23574831544721947, + "learning_rate": 0.0003649344748332072, + "loss": 2.4539, + "step": 5228 + }, + { + "epoch": 0.5995872033023736, + "grad_norm": 0.24842497262509552, + "learning_rate": 0.0003647556923661611, + "loss": 2.4594, + "step": 5229 + }, + { + "epoch": 0.5997018690517143, + "grad_norm": 0.2471358150526401, + "learning_rate": 0.0003645769285534894, + "loss": 2.2773, + "step": 5230 + }, + { + "epoch": 0.5998165348010549, + "grad_norm": 0.25336000889568916, + "learning_rate": 0.00036439818341984896, + "loss": 2.4491, + "step": 5231 + }, + { + "epoch": 0.5999312005503956, + "grad_norm": 0.24038133706702142, + "learning_rate": 0.0003642194569898941, + "loss": 2.3441, + "step": 5232 + }, + { + "epoch": 0.6000458662997362, + "grad_norm": 0.23827485018902722, + "learning_rate": 0.0003640407492882771, + "loss": 2.3761, + "step": 5233 + }, + { + "epoch": 0.6001605320490769, + "grad_norm": 0.25009552268642, + "learning_rate": 0.0003638620603396472, + "loss": 2.5022, + "step": 5234 + }, + { + "epoch": 0.6002751977984176, + "grad_norm": 0.26892645124857023, + "learning_rate": 0.00036368339016865087, + "loss": 2.5234, + "step": 5235 + }, + { + "epoch": 0.6003898635477583, + "grad_norm": 0.24086734630253276, + "learning_rate": 0.00036350473879993264, + "loss": 2.5373, + "step": 5236 + }, + { + "epoch": 0.600504529297099, + "grad_norm": 0.26438194766427175, + "learning_rate": 0.00036332610625813356, + "loss": 2.3789, + "step": 5237 + }, + { + "epoch": 0.6006191950464397, + "grad_norm": 0.2503044947664895, + "learning_rate": 0.00036314749256789313, + "loss": 2.3734, + "step": 5238 + }, + { + "epoch": 0.6007338607957803, + "grad_norm": 0.25485476236541055, + "learning_rate": 0.00036296889775384736, + "loss": 2.3764, + "step": 5239 + }, + { + "epoch": 0.600848526545121, + "grad_norm": 0.23020168719237788, + "learning_rate": 0.0003627903218406302, + "loss": 2.4581, + "step": 5240 + }, + { + "epoch": 0.6009631922944616, + "grad_norm": 0.2390960560803545, + "learning_rate": 0.00036261176485287254, + "loss": 2.4428, + "step": 5241 + }, + { + "epoch": 0.6010778580438023, + "grad_norm": 0.2443200328671786, + "learning_rate": 0.00036243322681520296, + "loss": 2.4613, + "step": 5242 + }, + { + "epoch": 0.601192523793143, + "grad_norm": 0.21772933236602846, + "learning_rate": 0.00036225470775224776, + "loss": 2.3255, + "step": 5243 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.2410248479107569, + "learning_rate": 0.00036207620768862993, + "loss": 2.5717, + "step": 5244 + }, + { + "epoch": 0.6014218552918243, + "grad_norm": 0.2701897960180836, + "learning_rate": 0.00036189772664897, + "loss": 2.521, + "step": 5245 + }, + { + "epoch": 0.601536521041165, + "grad_norm": 0.2696818783052496, + "learning_rate": 0.0003617192646578864, + "loss": 2.5238, + "step": 5246 + }, + { + "epoch": 0.6016511867905057, + "grad_norm": 0.25830125228983225, + "learning_rate": 0.0003615408217399941, + "loss": 2.5162, + "step": 5247 + }, + { + "epoch": 0.6017658525398464, + "grad_norm": 0.24510156602320524, + "learning_rate": 0.0003613623979199064, + "loss": 2.4919, + "step": 5248 + }, + { + "epoch": 0.601880518289187, + "grad_norm": 0.27127744347076677, + "learning_rate": 0.00036118399322223325, + "loss": 2.3951, + "step": 5249 + }, + { + "epoch": 0.6019951840385277, + "grad_norm": 0.24790545510913592, + "learning_rate": 0.0003610056076715821, + "loss": 2.3373, + "step": 5250 + }, + { + "epoch": 0.6021098497878684, + "grad_norm": 0.2395050306206691, + "learning_rate": 0.0003608272412925579, + "loss": 2.4889, + "step": 5251 + }, + { + "epoch": 0.602224515537209, + "grad_norm": 0.24885710111219678, + "learning_rate": 0.00036064889410976277, + "loss": 2.4734, + "step": 5252 + }, + { + "epoch": 0.6023391812865497, + "grad_norm": 0.22501903944497442, + "learning_rate": 0.0003604705661477966, + "loss": 2.4403, + "step": 5253 + }, + { + "epoch": 0.6024538470358903, + "grad_norm": 0.22437609639941897, + "learning_rate": 0.00036029225743125607, + "loss": 2.3677, + "step": 5254 + }, + { + "epoch": 0.602568512785231, + "grad_norm": 0.2378894472366075, + "learning_rate": 0.0003601139679847356, + "loss": 2.3169, + "step": 5255 + }, + { + "epoch": 0.6026831785345718, + "grad_norm": 0.25234647245825503, + "learning_rate": 0.00035993569783282687, + "loss": 2.5075, + "step": 5256 + }, + { + "epoch": 0.6027978442839124, + "grad_norm": 0.23323413610208354, + "learning_rate": 0.0003597574470001185, + "loss": 2.4357, + "step": 5257 + }, + { + "epoch": 0.6029125100332531, + "grad_norm": 0.24073819978385405, + "learning_rate": 0.0003595792155111971, + "loss": 2.4144, + "step": 5258 + }, + { + "epoch": 0.6030271757825938, + "grad_norm": 0.2651961054183406, + "learning_rate": 0.0003594010033906464, + "loss": 2.5571, + "step": 5259 + }, + { + "epoch": 0.6031418415319344, + "grad_norm": 0.23949874324184325, + "learning_rate": 0.0003592228106630472, + "loss": 2.4422, + "step": 5260 + }, + { + "epoch": 0.6032565072812751, + "grad_norm": 0.26359831191054023, + "learning_rate": 0.00035904463735297784, + "loss": 2.3899, + "step": 5261 + }, + { + "epoch": 0.6033711730306157, + "grad_norm": 0.2086565184507233, + "learning_rate": 0.00035886648348501364, + "loss": 2.4416, + "step": 5262 + }, + { + "epoch": 0.6034858387799564, + "grad_norm": 0.23061913271872425, + "learning_rate": 0.0003586883490837281, + "loss": 2.3939, + "step": 5263 + }, + { + "epoch": 0.6036005045292971, + "grad_norm": 0.223308440355955, + "learning_rate": 0.000358510234173691, + "loss": 2.5251, + "step": 5264 + }, + { + "epoch": 0.6037151702786377, + "grad_norm": 0.25036887415783954, + "learning_rate": 0.00035833213877947017, + "loss": 2.4599, + "step": 5265 + }, + { + "epoch": 0.6038298360279785, + "grad_norm": 0.2628068287257144, + "learning_rate": 0.0003581540629256303, + "loss": 2.6148, + "step": 5266 + }, + { + "epoch": 0.6039445017773191, + "grad_norm": 0.2586578251399628, + "learning_rate": 0.00035797600663673346, + "loss": 2.5151, + "step": 5267 + }, + { + "epoch": 0.6040591675266598, + "grad_norm": 0.2397139805977986, + "learning_rate": 0.00035779796993733946, + "loss": 2.4285, + "step": 5268 + }, + { + "epoch": 0.6041738332760005, + "grad_norm": 0.2826388600170993, + "learning_rate": 0.000357619952852005, + "loss": 2.5821, + "step": 5269 + }, + { + "epoch": 0.6042884990253411, + "grad_norm": 0.22960536477199134, + "learning_rate": 0.0003574419554052839, + "loss": 2.4805, + "step": 5270 + }, + { + "epoch": 0.6044031647746818, + "grad_norm": 0.25788793428903745, + "learning_rate": 0.0003572639776217277, + "loss": 2.5264, + "step": 5271 + }, + { + "epoch": 0.6045178305240225, + "grad_norm": 0.2517578499262476, + "learning_rate": 0.0003570860195258848, + "loss": 2.4289, + "step": 5272 + }, + { + "epoch": 0.6046324962733631, + "grad_norm": 0.22030637604971712, + "learning_rate": 0.00035690808114230145, + "loss": 2.2829, + "step": 5273 + }, + { + "epoch": 0.6047471620227038, + "grad_norm": 0.2703497515528306, + "learning_rate": 0.0003567301624955208, + "loss": 2.3831, + "step": 5274 + }, + { + "epoch": 0.6048618277720444, + "grad_norm": 0.25363086414629726, + "learning_rate": 0.00035655226361008325, + "loss": 2.567, + "step": 5275 + }, + { + "epoch": 0.6049764935213852, + "grad_norm": 0.24746203728924887, + "learning_rate": 0.0003563743845105265, + "loss": 2.408, + "step": 5276 + }, + { + "epoch": 0.6050911592707259, + "grad_norm": 0.24375154873142385, + "learning_rate": 0.0003561965252213855, + "loss": 2.3578, + "step": 5277 + }, + { + "epoch": 0.6052058250200665, + "grad_norm": 0.23743623433277636, + "learning_rate": 0.00035601868576719297, + "loss": 2.4494, + "step": 5278 + }, + { + "epoch": 0.6053204907694072, + "grad_norm": 0.2566299983399756, + "learning_rate": 0.0003558408661724781, + "loss": 2.4442, + "step": 5279 + }, + { + "epoch": 0.6054351565187478, + "grad_norm": 0.25666514634085624, + "learning_rate": 0.0003556630664617679, + "loss": 2.384, + "step": 5280 + }, + { + "epoch": 0.6055498222680885, + "grad_norm": 0.2354687967004414, + "learning_rate": 0.00035548528665958637, + "loss": 2.4164, + "step": 5281 + }, + { + "epoch": 0.6056644880174292, + "grad_norm": 0.26590811114037705, + "learning_rate": 0.00035530752679045465, + "loss": 2.4596, + "step": 5282 + }, + { + "epoch": 0.6057791537667698, + "grad_norm": 0.27734069168319, + "learning_rate": 0.00035512978687889156, + "loss": 2.485, + "step": 5283 + }, + { + "epoch": 0.6058938195161105, + "grad_norm": 0.2697654086713395, + "learning_rate": 0.00035495206694941304, + "loss": 2.4415, + "step": 5284 + }, + { + "epoch": 0.6060084852654513, + "grad_norm": 0.23765136557901423, + "learning_rate": 0.0003547743670265319, + "loss": 2.4124, + "step": 5285 + }, + { + "epoch": 0.6061231510147919, + "grad_norm": 0.2840736867979609, + "learning_rate": 0.00035459668713475875, + "loss": 2.3979, + "step": 5286 + }, + { + "epoch": 0.6062378167641326, + "grad_norm": 0.25820808907061493, + "learning_rate": 0.0003544190272986007, + "loss": 2.3158, + "step": 5287 + }, + { + "epoch": 0.6063524825134732, + "grad_norm": 0.25055694869910355, + "learning_rate": 0.0003542413875425631, + "loss": 2.4006, + "step": 5288 + }, + { + "epoch": 0.6064671482628139, + "grad_norm": 0.2759542549345469, + "learning_rate": 0.0003540637678911475, + "loss": 2.4224, + "step": 5289 + }, + { + "epoch": 0.6065818140121546, + "grad_norm": 0.24797148858259108, + "learning_rate": 0.00035388616836885334, + "loss": 2.5718, + "step": 5290 + }, + { + "epoch": 0.6066964797614952, + "grad_norm": 0.26304279965926813, + "learning_rate": 0.00035370858900017713, + "loss": 2.5309, + "step": 5291 + }, + { + "epoch": 0.6068111455108359, + "grad_norm": 0.25200293855260864, + "learning_rate": 0.00035353102980961227, + "loss": 2.3644, + "step": 5292 + }, + { + "epoch": 0.6069258112601766, + "grad_norm": 0.24138956654637037, + "learning_rate": 0.00035335349082165013, + "loss": 2.4127, + "step": 5293 + }, + { + "epoch": 0.6070404770095172, + "grad_norm": 0.22749679120027289, + "learning_rate": 0.00035317597206077866, + "loss": 2.4741, + "step": 5294 + }, + { + "epoch": 0.607155142758858, + "grad_norm": 0.2519236486163959, + "learning_rate": 0.00035299847355148295, + "loss": 2.2825, + "step": 5295 + }, + { + "epoch": 0.6072698085081986, + "grad_norm": 0.2387713141570366, + "learning_rate": 0.00035282099531824585, + "loss": 2.3967, + "step": 5296 + }, + { + "epoch": 0.6073844742575393, + "grad_norm": 0.24280946925227198, + "learning_rate": 0.0003526435373855468, + "loss": 2.4135, + "step": 5297 + }, + { + "epoch": 0.60749914000688, + "grad_norm": 0.23181557819701756, + "learning_rate": 0.0003524660997778629, + "loss": 2.4425, + "step": 5298 + }, + { + "epoch": 0.6076138057562206, + "grad_norm": 0.23932708120345042, + "learning_rate": 0.0003522886825196684, + "loss": 2.636, + "step": 5299 + }, + { + "epoch": 0.6077284715055613, + "grad_norm": 0.24252249074982363, + "learning_rate": 0.0003521112856354346, + "loss": 2.4565, + "step": 5300 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 0.22905014202754032, + "learning_rate": 0.00035193390914962973, + "loss": 2.3795, + "step": 5301 + }, + { + "epoch": 0.6079578030042426, + "grad_norm": 0.22919836267462645, + "learning_rate": 0.0003517565530867196, + "loss": 2.414, + "step": 5302 + }, + { + "epoch": 0.6080724687535833, + "grad_norm": 0.23040647159033725, + "learning_rate": 0.0003515792174711675, + "loss": 2.4626, + "step": 5303 + }, + { + "epoch": 0.6081871345029239, + "grad_norm": 0.2765265900062124, + "learning_rate": 0.00035140190232743303, + "loss": 2.4132, + "step": 5304 + }, + { + "epoch": 0.6083018002522647, + "grad_norm": 0.25576890938940805, + "learning_rate": 0.00035122460767997364, + "loss": 2.4717, + "step": 5305 + }, + { + "epoch": 0.6084164660016054, + "grad_norm": 0.23646117768922026, + "learning_rate": 0.0003510473335532438, + "loss": 2.4564, + "step": 5306 + }, + { + "epoch": 0.608531131750946, + "grad_norm": 0.2524754209515238, + "learning_rate": 0.0003508700799716947, + "loss": 2.5803, + "step": 5307 + }, + { + "epoch": 0.6086457975002867, + "grad_norm": 0.24059152236415793, + "learning_rate": 0.0003506928469597756, + "loss": 2.5121, + "step": 5308 + }, + { + "epoch": 0.6087604632496273, + "grad_norm": 0.27906973337441204, + "learning_rate": 0.00035051563454193226, + "loss": 2.445, + "step": 5309 + }, + { + "epoch": 0.608875128998968, + "grad_norm": 0.2494819472270163, + "learning_rate": 0.0003503384427426076, + "loss": 2.4065, + "step": 5310 + }, + { + "epoch": 0.6089897947483087, + "grad_norm": 0.24190555133948896, + "learning_rate": 0.00035016127158624203, + "loss": 2.4636, + "step": 5311 + }, + { + "epoch": 0.6091044604976493, + "grad_norm": 0.2431297784236629, + "learning_rate": 0.00034998412109727274, + "loss": 2.4355, + "step": 5312 + }, + { + "epoch": 0.60921912624699, + "grad_norm": 0.22485520845922946, + "learning_rate": 0.0003498069913001345, + "loss": 2.4302, + "step": 5313 + }, + { + "epoch": 0.6093337919963306, + "grad_norm": 0.25101323798351055, + "learning_rate": 0.0003496298822192588, + "loss": 2.4538, + "step": 5314 + }, + { + "epoch": 0.6094484577456714, + "grad_norm": 0.26425404006436537, + "learning_rate": 0.00034945279387907466, + "loss": 2.3939, + "step": 5315 + }, + { + "epoch": 0.6095631234950121, + "grad_norm": 0.2314290867886926, + "learning_rate": 0.00034927572630400773, + "loss": 2.4765, + "step": 5316 + }, + { + "epoch": 0.6096777892443527, + "grad_norm": 0.2500960088918181, + "learning_rate": 0.0003490986795184813, + "loss": 2.4621, + "step": 5317 + }, + { + "epoch": 0.6097924549936934, + "grad_norm": 0.22868225792125754, + "learning_rate": 0.0003489216535469157, + "loss": 2.4038, + "step": 5318 + }, + { + "epoch": 0.6099071207430341, + "grad_norm": 0.24570568204533658, + "learning_rate": 0.0003487446484137282, + "loss": 2.4946, + "step": 5319 + }, + { + "epoch": 0.6100217864923747, + "grad_norm": 0.24154158676179366, + "learning_rate": 0.0003485676641433332, + "loss": 2.5139, + "step": 5320 + }, + { + "epoch": 0.6101364522417154, + "grad_norm": 0.22998705878680656, + "learning_rate": 0.0003483907007601425, + "loss": 2.4438, + "step": 5321 + }, + { + "epoch": 0.610251117991056, + "grad_norm": 0.24924259417746333, + "learning_rate": 0.00034821375828856437, + "loss": 2.3711, + "step": 5322 + }, + { + "epoch": 0.6103657837403967, + "grad_norm": 0.25715444458758147, + "learning_rate": 0.00034803683675300513, + "loss": 2.5113, + "step": 5323 + }, + { + "epoch": 0.6104804494897375, + "grad_norm": 0.23253186839330517, + "learning_rate": 0.0003478599361778677, + "loss": 2.4714, + "step": 5324 + }, + { + "epoch": 0.6105951152390781, + "grad_norm": 0.24962603018695653, + "learning_rate": 0.000347683056587552, + "loss": 2.4384, + "step": 5325 + }, + { + "epoch": 0.6107097809884188, + "grad_norm": 0.2651151403688997, + "learning_rate": 0.0003475061980064551, + "loss": 2.3589, + "step": 5326 + }, + { + "epoch": 0.6108244467377595, + "grad_norm": 0.2497386988672167, + "learning_rate": 0.00034732936045897133, + "loss": 2.3886, + "step": 5327 + }, + { + "epoch": 0.6109391124871001, + "grad_norm": 0.22820889102795114, + "learning_rate": 0.0003471525439694924, + "loss": 2.4139, + "step": 5328 + }, + { + "epoch": 0.6110537782364408, + "grad_norm": 0.2476447657138765, + "learning_rate": 0.0003469757485624062, + "loss": 2.6063, + "step": 5329 + }, + { + "epoch": 0.6111684439857814, + "grad_norm": 0.2573423269290262, + "learning_rate": 0.00034679897426209883, + "loss": 2.458, + "step": 5330 + }, + { + "epoch": 0.6112831097351221, + "grad_norm": 0.29318343669114416, + "learning_rate": 0.0003466222210929526, + "loss": 2.3243, + "step": 5331 + }, + { + "epoch": 0.6113977754844628, + "grad_norm": 0.28252808873586965, + "learning_rate": 0.00034644548907934714, + "loss": 2.3889, + "step": 5332 + }, + { + "epoch": 0.6115124412338034, + "grad_norm": 0.2648378042282343, + "learning_rate": 0.0003462687782456596, + "loss": 2.4225, + "step": 5333 + }, + { + "epoch": 0.6116271069831442, + "grad_norm": 0.24412091446145343, + "learning_rate": 0.00034609208861626373, + "loss": 2.5899, + "step": 5334 + }, + { + "epoch": 0.6117417727324848, + "grad_norm": 0.24491556753776308, + "learning_rate": 0.0003459154202155305, + "loss": 2.3439, + "step": 5335 + }, + { + "epoch": 0.6118564384818255, + "grad_norm": 0.26291881972271297, + "learning_rate": 0.0003457387730678279, + "loss": 2.3998, + "step": 5336 + }, + { + "epoch": 0.6119711042311662, + "grad_norm": 0.24365504385917092, + "learning_rate": 0.000345562147197521, + "loss": 2.4626, + "step": 5337 + }, + { + "epoch": 0.6120857699805068, + "grad_norm": 0.23634335306218962, + "learning_rate": 0.00034538554262897217, + "loss": 2.4631, + "step": 5338 + }, + { + "epoch": 0.6122004357298475, + "grad_norm": 0.24538168385464576, + "learning_rate": 0.0003452089593865404, + "loss": 2.341, + "step": 5339 + }, + { + "epoch": 0.6123151014791882, + "grad_norm": 0.230633745840467, + "learning_rate": 0.0003450323974945822, + "loss": 2.2428, + "step": 5340 + }, + { + "epoch": 0.6124297672285288, + "grad_norm": 0.262455514925542, + "learning_rate": 0.0003448558569774507, + "loss": 2.4891, + "step": 5341 + }, + { + "epoch": 0.6125444329778695, + "grad_norm": 0.25328951010068307, + "learning_rate": 0.0003446793378594963, + "loss": 2.4043, + "step": 5342 + }, + { + "epoch": 0.6126590987272101, + "grad_norm": 0.26598490696861865, + "learning_rate": 0.0003445028401650668, + "loss": 2.5008, + "step": 5343 + }, + { + "epoch": 0.6127737644765509, + "grad_norm": 0.23458522331521278, + "learning_rate": 0.00034432636391850655, + "loss": 2.5524, + "step": 5344 + }, + { + "epoch": 0.6128884302258916, + "grad_norm": 0.2478789298453673, + "learning_rate": 0.0003441499091441568, + "loss": 2.3756, + "step": 5345 + }, + { + "epoch": 0.6130030959752322, + "grad_norm": 0.27509912124723396, + "learning_rate": 0.00034397347586635643, + "loss": 2.3928, + "step": 5346 + }, + { + "epoch": 0.6131177617245729, + "grad_norm": 0.279331823798233, + "learning_rate": 0.00034379706410944064, + "loss": 2.2535, + "step": 5347 + }, + { + "epoch": 0.6132324274739135, + "grad_norm": 0.257585247726897, + "learning_rate": 0.00034362067389774256, + "loss": 2.5285, + "step": 5348 + }, + { + "epoch": 0.6133470932232542, + "grad_norm": 0.23506507696650417, + "learning_rate": 0.0003434443052555917, + "loss": 2.3818, + "step": 5349 + }, + { + "epoch": 0.6134617589725949, + "grad_norm": 0.24023361477867655, + "learning_rate": 0.00034326795820731473, + "loss": 2.3018, + "step": 5350 + }, + { + "epoch": 0.6135764247219355, + "grad_norm": 0.23772648096040327, + "learning_rate": 0.0003430916327772352, + "loss": 2.4359, + "step": 5351 + }, + { + "epoch": 0.6136910904712762, + "grad_norm": 0.2532255201833486, + "learning_rate": 0.00034291532898967394, + "loss": 2.4785, + "step": 5352 + }, + { + "epoch": 0.613805756220617, + "grad_norm": 0.2434741852604548, + "learning_rate": 0.0003427390468689489, + "loss": 2.46, + "step": 5353 + }, + { + "epoch": 0.6139204219699576, + "grad_norm": 0.2515945637004177, + "learning_rate": 0.00034256278643937454, + "loss": 2.3422, + "step": 5354 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 0.26528098987973076, + "learning_rate": 0.000342386547725263, + "loss": 2.5322, + "step": 5355 + }, + { + "epoch": 0.6141497534686389, + "grad_norm": 0.2609598632082265, + "learning_rate": 0.00034221033075092265, + "loss": 2.4962, + "step": 5356 + }, + { + "epoch": 0.6142644192179796, + "grad_norm": 0.25369520182966604, + "learning_rate": 0.00034203413554065933, + "loss": 2.5861, + "step": 5357 + }, + { + "epoch": 0.6143790849673203, + "grad_norm": 0.23665506797778044, + "learning_rate": 0.000341857962118776, + "loss": 2.3958, + "step": 5358 + }, + { + "epoch": 0.6144937507166609, + "grad_norm": 0.2441334740332325, + "learning_rate": 0.00034168181050957247, + "loss": 2.3437, + "step": 5359 + }, + { + "epoch": 0.6146084164660016, + "grad_norm": 0.22214658520071795, + "learning_rate": 0.0003415056807373452, + "loss": 2.2619, + "step": 5360 + }, + { + "epoch": 0.6147230822153423, + "grad_norm": 0.23875081726655506, + "learning_rate": 0.00034132957282638824, + "loss": 2.3703, + "step": 5361 + }, + { + "epoch": 0.6148377479646829, + "grad_norm": 0.26443556683443215, + "learning_rate": 0.0003411534868009919, + "loss": 2.4505, + "step": 5362 + }, + { + "epoch": 0.6149524137140236, + "grad_norm": 0.2687047986855505, + "learning_rate": 0.0003409774226854444, + "loss": 2.3973, + "step": 5363 + }, + { + "epoch": 0.6150670794633643, + "grad_norm": 0.24301725854488332, + "learning_rate": 0.0003408013805040301, + "loss": 2.2918, + "step": 5364 + }, + { + "epoch": 0.615181745212705, + "grad_norm": 0.24735351836206526, + "learning_rate": 0.00034062536028103087, + "loss": 2.4766, + "step": 5365 + }, + { + "epoch": 0.6152964109620457, + "grad_norm": 0.246171574088853, + "learning_rate": 0.0003404493620407252, + "loss": 2.3657, + "step": 5366 + }, + { + "epoch": 0.6154110767113863, + "grad_norm": 0.2718113017641868, + "learning_rate": 0.0003402733858073885, + "loss": 2.5335, + "step": 5367 + }, + { + "epoch": 0.615525742460727, + "grad_norm": 0.2372415201337167, + "learning_rate": 0.0003400974316052938, + "loss": 2.4527, + "step": 5368 + }, + { + "epoch": 0.6156404082100676, + "grad_norm": 0.2417647084620309, + "learning_rate": 0.0003399214994587104, + "loss": 2.3643, + "step": 5369 + }, + { + "epoch": 0.6157550739594083, + "grad_norm": 0.262513289649278, + "learning_rate": 0.0003397455893919047, + "loss": 2.4304, + "step": 5370 + }, + { + "epoch": 0.615869739708749, + "grad_norm": 0.24577929035906254, + "learning_rate": 0.00033956970142914026, + "loss": 2.3848, + "step": 5371 + }, + { + "epoch": 0.6159844054580896, + "grad_norm": 0.259299308815684, + "learning_rate": 0.0003393938355946773, + "loss": 2.4103, + "step": 5372 + }, + { + "epoch": 0.6160990712074303, + "grad_norm": 0.25697462288424944, + "learning_rate": 0.0003392179919127734, + "loss": 2.4373, + "step": 5373 + }, + { + "epoch": 0.6162137369567711, + "grad_norm": 0.26246140448745964, + "learning_rate": 0.0003390421704076827, + "loss": 2.5576, + "step": 5374 + }, + { + "epoch": 0.6163284027061117, + "grad_norm": 0.2397669094543542, + "learning_rate": 0.0003388663711036566, + "loss": 2.5079, + "step": 5375 + }, + { + "epoch": 0.6164430684554524, + "grad_norm": 0.30014075123792505, + "learning_rate": 0.00033869059402494295, + "loss": 2.4433, + "step": 5376 + }, + { + "epoch": 0.616557734204793, + "grad_norm": 0.2333488140283522, + "learning_rate": 0.0003385148391957869, + "loss": 2.3793, + "step": 5377 + }, + { + "epoch": 0.6166723999541337, + "grad_norm": 0.24448568344986832, + "learning_rate": 0.0003383391066404308, + "loss": 2.4711, + "step": 5378 + }, + { + "epoch": 0.6167870657034744, + "grad_norm": 0.25274471680646865, + "learning_rate": 0.00033816339638311334, + "loss": 2.5064, + "step": 5379 + }, + { + "epoch": 0.616901731452815, + "grad_norm": 0.22863551919023348, + "learning_rate": 0.0003379877084480706, + "loss": 2.3477, + "step": 5380 + }, + { + "epoch": 0.6170163972021557, + "grad_norm": 0.23754900820380526, + "learning_rate": 0.00033781204285953515, + "loss": 2.488, + "step": 5381 + }, + { + "epoch": 0.6171310629514963, + "grad_norm": 0.2472998426717085, + "learning_rate": 0.00033763639964173663, + "loss": 2.4095, + "step": 5382 + }, + { + "epoch": 0.617245728700837, + "grad_norm": 0.24491155008254736, + "learning_rate": 0.000337460778818902, + "loss": 2.3917, + "step": 5383 + }, + { + "epoch": 0.6173603944501778, + "grad_norm": 0.25190449317129016, + "learning_rate": 0.00033728518041525467, + "loss": 2.4115, + "step": 5384 + }, + { + "epoch": 0.6174750601995184, + "grad_norm": 0.23093202841879926, + "learning_rate": 0.00033710960445501505, + "loss": 2.3739, + "step": 5385 + }, + { + "epoch": 0.6175897259488591, + "grad_norm": 0.24569168772200303, + "learning_rate": 0.0003369340509624006, + "loss": 2.4419, + "step": 5386 + }, + { + "epoch": 0.6177043916981998, + "grad_norm": 0.2562113725187753, + "learning_rate": 0.0003367585199616253, + "loss": 2.3976, + "step": 5387 + }, + { + "epoch": 0.6178190574475404, + "grad_norm": 0.22528454890243937, + "learning_rate": 0.0003365830114769007, + "loss": 2.3696, + "step": 5388 + }, + { + "epoch": 0.6179337231968811, + "grad_norm": 0.22612204732915472, + "learning_rate": 0.0003364075255324345, + "loss": 2.4453, + "step": 5389 + }, + { + "epoch": 0.6180483889462217, + "grad_norm": 0.23281230148204482, + "learning_rate": 0.00033623206215243197, + "loss": 2.4307, + "step": 5390 + }, + { + "epoch": 0.6181630546955624, + "grad_norm": 0.23161343870385326, + "learning_rate": 0.0003360566213610947, + "loss": 2.3729, + "step": 5391 + }, + { + "epoch": 0.6182777204449031, + "grad_norm": 0.2324230132405558, + "learning_rate": 0.0003358812031826213, + "loss": 2.424, + "step": 5392 + }, + { + "epoch": 0.6183923861942437, + "grad_norm": 0.2572644201045394, + "learning_rate": 0.0003357058076412077, + "loss": 2.4777, + "step": 5393 + }, + { + "epoch": 0.6185070519435845, + "grad_norm": 0.2308997959575474, + "learning_rate": 0.0003355304347610463, + "loss": 2.4043, + "step": 5394 + }, + { + "epoch": 0.6186217176929252, + "grad_norm": 0.25205588247545285, + "learning_rate": 0.0003353550845663261, + "loss": 2.2769, + "step": 5395 + }, + { + "epoch": 0.6187363834422658, + "grad_norm": 0.2669851319857069, + "learning_rate": 0.0003351797570812338, + "loss": 2.36, + "step": 5396 + }, + { + "epoch": 0.6188510491916065, + "grad_norm": 0.22808200599646938, + "learning_rate": 0.00033500445232995205, + "loss": 2.4196, + "step": 5397 + }, + { + "epoch": 0.6189657149409471, + "grad_norm": 0.24670492708992056, + "learning_rate": 0.00033482917033666104, + "loss": 2.4569, + "step": 5398 + }, + { + "epoch": 0.6190803806902878, + "grad_norm": 0.24139017048679554, + "learning_rate": 0.00033465391112553767, + "loss": 2.4359, + "step": 5399 + }, + { + "epoch": 0.6191950464396285, + "grad_norm": 0.24756109501414555, + "learning_rate": 0.0003344786747207555, + "loss": 2.4111, + "step": 5400 + }, + { + "epoch": 0.6193097121889691, + "grad_norm": 0.2866634266847848, + "learning_rate": 0.0003343034611464849, + "loss": 2.5018, + "step": 5401 + }, + { + "epoch": 0.6194243779383098, + "grad_norm": 0.24863575240457936, + "learning_rate": 0.0003341282704268933, + "loss": 2.3661, + "step": 5402 + }, + { + "epoch": 0.6195390436876504, + "grad_norm": 0.23477959265776807, + "learning_rate": 0.0003339531025861451, + "loss": 2.3637, + "step": 5403 + }, + { + "epoch": 0.6196537094369912, + "grad_norm": 0.24253278868488765, + "learning_rate": 0.0003337779576484012, + "loss": 2.6492, + "step": 5404 + }, + { + "epoch": 0.6197683751863319, + "grad_norm": 0.26010857383535524, + "learning_rate": 0.00033360283563781975, + "loss": 2.5186, + "step": 5405 + }, + { + "epoch": 0.6198830409356725, + "grad_norm": 0.21839527200976241, + "learning_rate": 0.0003334277365785551, + "loss": 2.413, + "step": 5406 + }, + { + "epoch": 0.6199977066850132, + "grad_norm": 0.27046315936372234, + "learning_rate": 0.0003332526604947591, + "loss": 2.3708, + "step": 5407 + }, + { + "epoch": 0.6201123724343539, + "grad_norm": 0.24933712577442851, + "learning_rate": 0.00033307760741058004, + "loss": 2.561, + "step": 5408 + }, + { + "epoch": 0.6202270381836945, + "grad_norm": 0.24644868132629197, + "learning_rate": 0.0003329025773501634, + "loss": 2.3822, + "step": 5409 + }, + { + "epoch": 0.6203417039330352, + "grad_norm": 0.2607458816152741, + "learning_rate": 0.0003327275703376508, + "loss": 2.3869, + "step": 5410 + }, + { + "epoch": 0.6204563696823758, + "grad_norm": 0.2426621633219935, + "learning_rate": 0.0003325525863971816, + "loss": 2.3628, + "step": 5411 + }, + { + "epoch": 0.6205710354317165, + "grad_norm": 0.2537107141174077, + "learning_rate": 0.000332377625552891, + "loss": 2.5197, + "step": 5412 + }, + { + "epoch": 0.6206857011810573, + "grad_norm": 0.2524824562248918, + "learning_rate": 0.00033220268782891197, + "loss": 2.4506, + "step": 5413 + }, + { + "epoch": 0.6208003669303979, + "grad_norm": 0.2452999470591215, + "learning_rate": 0.0003320277732493736, + "loss": 2.4518, + "step": 5414 + }, + { + "epoch": 0.6209150326797386, + "grad_norm": 0.2640588940936114, + "learning_rate": 0.0003318528818384021, + "loss": 2.4165, + "step": 5415 + }, + { + "epoch": 0.6210296984290792, + "grad_norm": 0.23576368038103548, + "learning_rate": 0.00033167801362012035, + "loss": 2.4621, + "step": 5416 + }, + { + "epoch": 0.6211443641784199, + "grad_norm": 0.2510023379825873, + "learning_rate": 0.00033150316861864793, + "loss": 2.3299, + "step": 5417 + }, + { + "epoch": 0.6212590299277606, + "grad_norm": 0.26690381175729844, + "learning_rate": 0.00033132834685810185, + "loss": 2.4362, + "step": 5418 + }, + { + "epoch": 0.6213736956771012, + "grad_norm": 0.27413337620667927, + "learning_rate": 0.0003311535483625951, + "loss": 2.5005, + "step": 5419 + }, + { + "epoch": 0.6214883614264419, + "grad_norm": 0.22795907387295283, + "learning_rate": 0.00033097877315623776, + "loss": 2.4265, + "step": 5420 + }, + { + "epoch": 0.6216030271757826, + "grad_norm": 0.23039072247904063, + "learning_rate": 0.0003308040212631369, + "loss": 2.5533, + "step": 5421 + }, + { + "epoch": 0.6217176929251232, + "grad_norm": 0.24550036024223942, + "learning_rate": 0.00033062929270739595, + "loss": 2.4412, + "step": 5422 + }, + { + "epoch": 0.621832358674464, + "grad_norm": 0.2706890173584199, + "learning_rate": 0.0003304545875131157, + "loss": 2.4122, + "step": 5423 + }, + { + "epoch": 0.6219470244238046, + "grad_norm": 0.22197097543097835, + "learning_rate": 0.0003302799057043934, + "loss": 2.571, + "step": 5424 + }, + { + "epoch": 0.6220616901731453, + "grad_norm": 0.24134249971579047, + "learning_rate": 0.0003301052473053228, + "loss": 2.3304, + "step": 5425 + }, + { + "epoch": 0.622176355922486, + "grad_norm": 0.258388470223987, + "learning_rate": 0.0003299306123399949, + "loss": 2.4532, + "step": 5426 + }, + { + "epoch": 0.6222910216718266, + "grad_norm": 0.2593944358738077, + "learning_rate": 0.000329756000832497, + "loss": 2.4445, + "step": 5427 + }, + { + "epoch": 0.6224056874211673, + "grad_norm": 0.2656660588069641, + "learning_rate": 0.0003295814128069139, + "loss": 2.3967, + "step": 5428 + }, + { + "epoch": 0.622520353170508, + "grad_norm": 0.231867502400429, + "learning_rate": 0.00032940684828732637, + "loss": 2.4026, + "step": 5429 + }, + { + "epoch": 0.6226350189198486, + "grad_norm": 0.25092649502049413, + "learning_rate": 0.00032923230729781235, + "loss": 2.3368, + "step": 5430 + }, + { + "epoch": 0.6227496846691893, + "grad_norm": 0.25358475929739666, + "learning_rate": 0.00032905778986244634, + "loss": 2.4898, + "step": 5431 + }, + { + "epoch": 0.6228643504185299, + "grad_norm": 0.26379407018434853, + "learning_rate": 0.0003288832960052998, + "loss": 2.3477, + "step": 5432 + }, + { + "epoch": 0.6229790161678707, + "grad_norm": 0.25057578476317127, + "learning_rate": 0.0003287088257504407, + "loss": 2.325, + "step": 5433 + }, + { + "epoch": 0.6230936819172114, + "grad_norm": 0.26982041231049947, + "learning_rate": 0.0003285343791219341, + "loss": 2.4719, + "step": 5434 + }, + { + "epoch": 0.623208347666552, + "grad_norm": 0.2614465627258252, + "learning_rate": 0.0003283599561438414, + "loss": 2.446, + "step": 5435 + }, + { + "epoch": 0.6233230134158927, + "grad_norm": 0.25195433707880965, + "learning_rate": 0.0003281855568402211, + "loss": 2.421, + "step": 5436 + }, + { + "epoch": 0.6234376791652333, + "grad_norm": 0.22649154864278348, + "learning_rate": 0.00032801118123512794, + "loss": 2.4118, + "step": 5437 + }, + { + "epoch": 0.623552344914574, + "grad_norm": 0.24825349844754543, + "learning_rate": 0.00032783682935261417, + "loss": 2.4357, + "step": 5438 + }, + { + "epoch": 0.6236670106639147, + "grad_norm": 0.24996870611544156, + "learning_rate": 0.00032766250121672784, + "loss": 2.2983, + "step": 5439 + }, + { + "epoch": 0.6237816764132553, + "grad_norm": 0.2701061304116632, + "learning_rate": 0.00032748819685151465, + "loss": 2.4118, + "step": 5440 + }, + { + "epoch": 0.623896342162596, + "grad_norm": 0.25082489388495977, + "learning_rate": 0.00032731391628101615, + "loss": 2.3431, + "step": 5441 + }, + { + "epoch": 0.6240110079119368, + "grad_norm": 0.253547312280989, + "learning_rate": 0.00032713965952927115, + "loss": 2.4363, + "step": 5442 + }, + { + "epoch": 0.6241256736612774, + "grad_norm": 0.23819635344120613, + "learning_rate": 0.00032696542662031527, + "loss": 2.3073, + "step": 5443 + }, + { + "epoch": 0.6242403394106181, + "grad_norm": 0.24724090571936858, + "learning_rate": 0.0003267912175781803, + "loss": 2.6248, + "step": 5444 + }, + { + "epoch": 0.6243550051599587, + "grad_norm": 0.23203628175832056, + "learning_rate": 0.0003266170324268954, + "loss": 2.4089, + "step": 5445 + }, + { + "epoch": 0.6244696709092994, + "grad_norm": 0.2551391047297994, + "learning_rate": 0.0003264428711904859, + "loss": 2.3243, + "step": 5446 + }, + { + "epoch": 0.6245843366586401, + "grad_norm": 0.2328920030844889, + "learning_rate": 0.00032626873389297374, + "loss": 2.5156, + "step": 5447 + }, + { + "epoch": 0.6246990024079807, + "grad_norm": 0.24468383494260626, + "learning_rate": 0.00032609462055837826, + "loss": 2.5453, + "step": 5448 + }, + { + "epoch": 0.6248136681573214, + "grad_norm": 0.2509440208584527, + "learning_rate": 0.000325920531210715, + "loss": 2.493, + "step": 5449 + }, + { + "epoch": 0.624928333906662, + "grad_norm": 0.21954094729802456, + "learning_rate": 0.00032574646587399614, + "loss": 2.3892, + "step": 5450 + }, + { + "epoch": 0.6250429996560027, + "grad_norm": 0.2555694687744024, + "learning_rate": 0.0003255724245722308, + "loss": 2.4452, + "step": 5451 + }, + { + "epoch": 0.6251576654053435, + "grad_norm": 0.2410474378451684, + "learning_rate": 0.0003253984073294245, + "loss": 2.5267, + "step": 5452 + }, + { + "epoch": 0.6252723311546841, + "grad_norm": 0.23963320541046418, + "learning_rate": 0.0003252244141695798, + "loss": 2.4459, + "step": 5453 + }, + { + "epoch": 0.6253869969040248, + "grad_norm": 0.2351413068638118, + "learning_rate": 0.0003250504451166956, + "loss": 2.4315, + "step": 5454 + }, + { + "epoch": 0.6255016626533655, + "grad_norm": 0.23543347380739077, + "learning_rate": 0.0003248765001947677, + "loss": 2.4486, + "step": 5455 + }, + { + "epoch": 0.6256163284027061, + "grad_norm": 0.24500196711101976, + "learning_rate": 0.00032470257942778837, + "loss": 2.3563, + "step": 5456 + }, + { + "epoch": 0.6257309941520468, + "grad_norm": 0.26353541801449626, + "learning_rate": 0.00032452868283974675, + "loss": 2.4245, + "step": 5457 + }, + { + "epoch": 0.6258456599013874, + "grad_norm": 0.21686078378451998, + "learning_rate": 0.00032435481045462854, + "loss": 2.2888, + "step": 5458 + }, + { + "epoch": 0.6259603256507281, + "grad_norm": 0.2415018207485043, + "learning_rate": 0.0003241809622964162, + "loss": 2.469, + "step": 5459 + }, + { + "epoch": 0.6260749914000688, + "grad_norm": 0.23086990660280785, + "learning_rate": 0.00032400713838908856, + "loss": 2.4293, + "step": 5460 + }, + { + "epoch": 0.6261896571494094, + "grad_norm": 0.23851945675743613, + "learning_rate": 0.00032383333875662165, + "loss": 2.5474, + "step": 5461 + }, + { + "epoch": 0.6263043228987502, + "grad_norm": 0.2690089692349101, + "learning_rate": 0.00032365956342298737, + "loss": 2.6645, + "step": 5462 + }, + { + "epoch": 0.6264189886480909, + "grad_norm": 0.24439181347511676, + "learning_rate": 0.0003234858124121551, + "loss": 2.51, + "step": 5463 + }, + { + "epoch": 0.6265336543974315, + "grad_norm": 0.26903258539880825, + "learning_rate": 0.00032331208574809035, + "loss": 2.5185, + "step": 5464 + }, + { + "epoch": 0.6266483201467722, + "grad_norm": 0.26012570838808835, + "learning_rate": 0.0003231383834547554, + "loss": 2.3846, + "step": 5465 + }, + { + "epoch": 0.6267629858961128, + "grad_norm": 0.2296438192908017, + "learning_rate": 0.0003229647055561091, + "loss": 2.3762, + "step": 5466 + }, + { + "epoch": 0.6268776516454535, + "grad_norm": 0.23473058490373952, + "learning_rate": 0.000322791052076107, + "loss": 2.376, + "step": 5467 + }, + { + "epoch": 0.6269923173947942, + "grad_norm": 0.23961258660261542, + "learning_rate": 0.00032261742303870154, + "loss": 2.6897, + "step": 5468 + }, + { + "epoch": 0.6271069831441348, + "grad_norm": 0.2600271461463504, + "learning_rate": 0.0003224438184678412, + "loss": 2.4828, + "step": 5469 + }, + { + "epoch": 0.6272216488934755, + "grad_norm": 0.2652892508562063, + "learning_rate": 0.0003222702383874717, + "loss": 2.4042, + "step": 5470 + }, + { + "epoch": 0.6273363146428161, + "grad_norm": 0.2797844459288448, + "learning_rate": 0.00032209668282153506, + "loss": 2.5332, + "step": 5471 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.28364195810296916, + "learning_rate": 0.0003219231517939696, + "loss": 2.4018, + "step": 5472 + }, + { + "epoch": 0.6275656461414976, + "grad_norm": 0.25439327998842626, + "learning_rate": 0.00032174964532871096, + "loss": 2.3814, + "step": 5473 + }, + { + "epoch": 0.6276803118908382, + "grad_norm": 0.26109929265176657, + "learning_rate": 0.00032157616344969113, + "loss": 2.4081, + "step": 5474 + }, + { + "epoch": 0.6277949776401789, + "grad_norm": 0.23569691371734813, + "learning_rate": 0.00032140270618083845, + "loss": 2.4352, + "step": 5475 + }, + { + "epoch": 0.6279096433895196, + "grad_norm": 0.2520229030454903, + "learning_rate": 0.0003212292735460781, + "loss": 2.3584, + "step": 5476 + }, + { + "epoch": 0.6280243091388602, + "grad_norm": 0.23998395655732369, + "learning_rate": 0.0003210558655693316, + "loss": 2.4266, + "step": 5477 + }, + { + "epoch": 0.6281389748882009, + "grad_norm": 0.25625259390238025, + "learning_rate": 0.00032088248227451767, + "loss": 2.5287, + "step": 5478 + }, + { + "epoch": 0.6282536406375415, + "grad_norm": 0.2501529640672899, + "learning_rate": 0.00032070912368555086, + "loss": 2.482, + "step": 5479 + }, + { + "epoch": 0.6283683063868822, + "grad_norm": 0.2345353127462096, + "learning_rate": 0.000320535789826343, + "loss": 2.1879, + "step": 5480 + }, + { + "epoch": 0.628482972136223, + "grad_norm": 0.23688116286080194, + "learning_rate": 0.0003203624807208019, + "loss": 2.3143, + "step": 5481 + }, + { + "epoch": 0.6285976378855636, + "grad_norm": 0.22545522103322052, + "learning_rate": 0.00032018919639283246, + "loss": 2.4015, + "step": 5482 + }, + { + "epoch": 0.6287123036349043, + "grad_norm": 0.21959471011646534, + "learning_rate": 0.0003200159368663358, + "loss": 2.4177, + "step": 5483 + }, + { + "epoch": 0.6288269693842449, + "grad_norm": 0.2501554885623485, + "learning_rate": 0.00031984270216520996, + "loss": 2.2608, + "step": 5484 + }, + { + "epoch": 0.6289416351335856, + "grad_norm": 0.24316059208074314, + "learning_rate": 0.0003196694923133491, + "loss": 2.3335, + "step": 5485 + }, + { + "epoch": 0.6290563008829263, + "grad_norm": 0.23623194923116173, + "learning_rate": 0.0003194963073346445, + "loss": 2.491, + "step": 5486 + }, + { + "epoch": 0.6291709666322669, + "grad_norm": 0.24110661172697564, + "learning_rate": 0.00031932314725298345, + "loss": 2.399, + "step": 5487 + }, + { + "epoch": 0.6292856323816076, + "grad_norm": 0.26376468003678255, + "learning_rate": 0.00031915001209225025, + "loss": 2.557, + "step": 5488 + }, + { + "epoch": 0.6294002981309483, + "grad_norm": 0.26543914747271163, + "learning_rate": 0.00031897690187632574, + "loss": 2.4134, + "step": 5489 + }, + { + "epoch": 0.6295149638802889, + "grad_norm": 0.2508087983585551, + "learning_rate": 0.00031880381662908693, + "loss": 2.5225, + "step": 5490 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.26452554061035183, + "learning_rate": 0.00031863075637440764, + "loss": 2.3985, + "step": 5491 + }, + { + "epoch": 0.6297442953789703, + "grad_norm": 0.25791049454827547, + "learning_rate": 0.00031845772113615813, + "loss": 2.565, + "step": 5492 + }, + { + "epoch": 0.629858961128311, + "grad_norm": 0.2578357698457459, + "learning_rate": 0.00031828471093820577, + "loss": 2.3649, + "step": 5493 + }, + { + "epoch": 0.6299736268776517, + "grad_norm": 0.2694390128545404, + "learning_rate": 0.0003181117258044136, + "loss": 2.4125, + "step": 5494 + }, + { + "epoch": 0.6300882926269923, + "grad_norm": 0.25791065427027676, + "learning_rate": 0.0003179387657586418, + "loss": 2.4542, + "step": 5495 + }, + { + "epoch": 0.630202958376333, + "grad_norm": 0.2423263581930325, + "learning_rate": 0.00031776583082474685, + "loss": 2.4937, + "step": 5496 + }, + { + "epoch": 0.6303176241256737, + "grad_norm": 0.2916520850055632, + "learning_rate": 0.0003175929210265817, + "loss": 2.3076, + "step": 5497 + }, + { + "epoch": 0.6304322898750143, + "grad_norm": 0.24161540883828006, + "learning_rate": 0.0003174200363879962, + "loss": 2.3997, + "step": 5498 + }, + { + "epoch": 0.630546955624355, + "grad_norm": 0.23804532370719833, + "learning_rate": 0.00031724717693283643, + "loss": 2.4723, + "step": 5499 + }, + { + "epoch": 0.6306616213736956, + "grad_norm": 0.2467822792884135, + "learning_rate": 0.000317074342684945, + "loss": 2.4242, + "step": 5500 + }, + { + "epoch": 0.6307762871230363, + "grad_norm": 0.23737049206667943, + "learning_rate": 0.0003169015336681612, + "loss": 2.4923, + "step": 5501 + }, + { + "epoch": 0.6308909528723771, + "grad_norm": 0.2506743641223088, + "learning_rate": 0.00031672874990632037, + "loss": 2.3795, + "step": 5502 + }, + { + "epoch": 0.6310056186217177, + "grad_norm": 0.23909620289245787, + "learning_rate": 0.0003165559914232553, + "loss": 2.3816, + "step": 5503 + }, + { + "epoch": 0.6311202843710584, + "grad_norm": 0.2375816680017304, + "learning_rate": 0.00031638325824279433, + "loss": 2.4596, + "step": 5504 + }, + { + "epoch": 0.631234950120399, + "grad_norm": 0.26140138403476304, + "learning_rate": 0.00031621055038876303, + "loss": 2.4322, + "step": 5505 + }, + { + "epoch": 0.6313496158697397, + "grad_norm": 0.22467097639761296, + "learning_rate": 0.00031603786788498265, + "loss": 2.5442, + "step": 5506 + }, + { + "epoch": 0.6314642816190804, + "grad_norm": 0.24217797317589312, + "learning_rate": 0.0003158652107552719, + "loss": 2.5105, + "step": 5507 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.23632741961744183, + "learning_rate": 0.0003156925790234454, + "loss": 2.3843, + "step": 5508 + }, + { + "epoch": 0.6316936131177617, + "grad_norm": 0.2383480290351181, + "learning_rate": 0.00031551997271331446, + "loss": 2.2803, + "step": 5509 + }, + { + "epoch": 0.6318082788671024, + "grad_norm": 0.26298746529120715, + "learning_rate": 0.0003153473918486867, + "loss": 2.5991, + "step": 5510 + }, + { + "epoch": 0.631922944616443, + "grad_norm": 0.2504096133030129, + "learning_rate": 0.0003151748364533665, + "loss": 2.4849, + "step": 5511 + }, + { + "epoch": 0.6320376103657838, + "grad_norm": 0.23958811871731764, + "learning_rate": 0.00031500230655115435, + "loss": 2.2696, + "step": 5512 + }, + { + "epoch": 0.6321522761151244, + "grad_norm": 0.25787432311562236, + "learning_rate": 0.0003148298021658478, + "loss": 2.4625, + "step": 5513 + }, + { + "epoch": 0.6322669418644651, + "grad_norm": 0.23531505375305742, + "learning_rate": 0.0003146573233212404, + "loss": 2.3895, + "step": 5514 + }, + { + "epoch": 0.6323816076138058, + "grad_norm": 0.2261254023625664, + "learning_rate": 0.0003144848700411223, + "loss": 2.4434, + "step": 5515 + }, + { + "epoch": 0.6324962733631464, + "grad_norm": 0.23953155573569931, + "learning_rate": 0.00031431244234928, + "loss": 2.4295, + "step": 5516 + }, + { + "epoch": 0.6326109391124871, + "grad_norm": 0.28301790978497304, + "learning_rate": 0.0003141400402694967, + "loss": 2.3678, + "step": 5517 + }, + { + "epoch": 0.6327256048618277, + "grad_norm": 0.25936219853458653, + "learning_rate": 0.00031396766382555226, + "loss": 2.4512, + "step": 5518 + }, + { + "epoch": 0.6328402706111684, + "grad_norm": 0.26562696973589034, + "learning_rate": 0.00031379531304122235, + "loss": 2.5733, + "step": 5519 + }, + { + "epoch": 0.6329549363605091, + "grad_norm": 0.26699292220508375, + "learning_rate": 0.00031362298794027977, + "loss": 2.4473, + "step": 5520 + }, + { + "epoch": 0.6330696021098497, + "grad_norm": 0.2558328254177126, + "learning_rate": 0.00031345068854649326, + "loss": 2.5546, + "step": 5521 + }, + { + "epoch": 0.6331842678591905, + "grad_norm": 0.23306523667239934, + "learning_rate": 0.00031327841488362816, + "loss": 2.6028, + "step": 5522 + }, + { + "epoch": 0.6332989336085312, + "grad_norm": 0.2681128333970615, + "learning_rate": 0.0003131061669754466, + "loss": 2.3875, + "step": 5523 + }, + { + "epoch": 0.6334135993578718, + "grad_norm": 0.24744012009539318, + "learning_rate": 0.0003129339448457069, + "loss": 2.3353, + "step": 5524 + }, + { + "epoch": 0.6335282651072125, + "grad_norm": 0.26523398461921827, + "learning_rate": 0.0003127617485181635, + "loss": 2.4449, + "step": 5525 + }, + { + "epoch": 0.6336429308565531, + "grad_norm": 0.23182057506235487, + "learning_rate": 0.000312589578016568, + "loss": 2.4889, + "step": 5526 + }, + { + "epoch": 0.6337575966058938, + "grad_norm": 0.25115581753960103, + "learning_rate": 0.00031241743336466767, + "loss": 2.4399, + "step": 5527 + }, + { + "epoch": 0.6338722623552345, + "grad_norm": 0.26182569183589843, + "learning_rate": 0.00031224531458620686, + "loss": 2.4776, + "step": 5528 + }, + { + "epoch": 0.6339869281045751, + "grad_norm": 0.24567204287346073, + "learning_rate": 0.0003120732217049259, + "loss": 2.3643, + "step": 5529 + }, + { + "epoch": 0.6341015938539158, + "grad_norm": 0.21424622793344852, + "learning_rate": 0.00031190115474456195, + "loss": 2.4366, + "step": 5530 + }, + { + "epoch": 0.6342162596032566, + "grad_norm": 0.23698383033086595, + "learning_rate": 0.00031172911372884807, + "loss": 2.3229, + "step": 5531 + }, + { + "epoch": 0.6343309253525972, + "grad_norm": 0.25363311166436553, + "learning_rate": 0.0003115570986815141, + "loss": 2.4634, + "step": 5532 + }, + { + "epoch": 0.6344455911019379, + "grad_norm": 0.24652759686856138, + "learning_rate": 0.0003113851096262865, + "loss": 2.5557, + "step": 5533 + }, + { + "epoch": 0.6345602568512785, + "grad_norm": 0.24408343214048267, + "learning_rate": 0.0003112131465868877, + "loss": 2.4431, + "step": 5534 + }, + { + "epoch": 0.6346749226006192, + "grad_norm": 0.23515488338936574, + "learning_rate": 0.00031104120958703666, + "loss": 2.4399, + "step": 5535 + }, + { + "epoch": 0.6347895883499599, + "grad_norm": 0.24296324443275436, + "learning_rate": 0.00031086929865044896, + "loss": 2.3646, + "step": 5536 + }, + { + "epoch": 0.6349042540993005, + "grad_norm": 0.268929637905429, + "learning_rate": 0.0003106974138008362, + "loss": 2.3471, + "step": 5537 + }, + { + "epoch": 0.6350189198486412, + "grad_norm": 0.25946396242332137, + "learning_rate": 0.00031052555506190673, + "loss": 2.4154, + "step": 5538 + }, + { + "epoch": 0.6351335855979818, + "grad_norm": 0.2287616870704528, + "learning_rate": 0.00031035372245736544, + "loss": 2.2702, + "step": 5539 + }, + { + "epoch": 0.6352482513473225, + "grad_norm": 0.2479208663589942, + "learning_rate": 0.0003101819160109131, + "loss": 2.3541, + "step": 5540 + }, + { + "epoch": 0.6353629170966633, + "grad_norm": 0.24710559483474936, + "learning_rate": 0.00031001013574624714, + "loss": 2.364, + "step": 5541 + }, + { + "epoch": 0.6354775828460039, + "grad_norm": 0.2486665537378549, + "learning_rate": 0.00030983838168706124, + "loss": 2.3355, + "step": 5542 + }, + { + "epoch": 0.6355922485953446, + "grad_norm": 0.24808884454052754, + "learning_rate": 0.0003096666538570461, + "loss": 2.4294, + "step": 5543 + }, + { + "epoch": 0.6357069143446853, + "grad_norm": 0.25631861026758807, + "learning_rate": 0.0003094949522798877, + "loss": 2.4378, + "step": 5544 + }, + { + "epoch": 0.6358215800940259, + "grad_norm": 0.25654630331148426, + "learning_rate": 0.0003093232769792695, + "loss": 2.4426, + "step": 5545 + }, + { + "epoch": 0.6359362458433666, + "grad_norm": 0.25625823062406006, + "learning_rate": 0.0003091516279788706, + "loss": 2.5253, + "step": 5546 + }, + { + "epoch": 0.6360509115927072, + "grad_norm": 0.2447769550132519, + "learning_rate": 0.0003089800053023665, + "loss": 2.4636, + "step": 5547 + }, + { + "epoch": 0.6361655773420479, + "grad_norm": 0.279545014330734, + "learning_rate": 0.00030880840897342955, + "loss": 2.5956, + "step": 5548 + }, + { + "epoch": 0.6362802430913886, + "grad_norm": 0.2742271169893888, + "learning_rate": 0.0003086368390157283, + "loss": 2.3668, + "step": 5549 + }, + { + "epoch": 0.6363949088407292, + "grad_norm": 0.2560739523412276, + "learning_rate": 0.0003084652954529273, + "loss": 2.388, + "step": 5550 + }, + { + "epoch": 0.63650957459007, + "grad_norm": 0.2725102637306408, + "learning_rate": 0.00030829377830868785, + "loss": 2.4671, + "step": 5551 + }, + { + "epoch": 0.6366242403394106, + "grad_norm": 0.27811919686798214, + "learning_rate": 0.0003081222876066675, + "loss": 2.3782, + "step": 5552 + }, + { + "epoch": 0.6367389060887513, + "grad_norm": 0.2278704699688382, + "learning_rate": 0.00030795082337051985, + "loss": 2.4658, + "step": 5553 + }, + { + "epoch": 0.636853571838092, + "grad_norm": 0.2343502685248253, + "learning_rate": 0.0003077793856238954, + "loss": 2.337, + "step": 5554 + }, + { + "epoch": 0.6369682375874326, + "grad_norm": 0.2418894524517346, + "learning_rate": 0.0003076079743904407, + "loss": 2.3272, + "step": 5555 + }, + { + "epoch": 0.6370829033367733, + "grad_norm": 0.2674808932411423, + "learning_rate": 0.00030743658969379864, + "loss": 2.448, + "step": 5556 + }, + { + "epoch": 0.637197569086114, + "grad_norm": 0.26549930620050916, + "learning_rate": 0.0003072652315576084, + "loss": 2.4675, + "step": 5557 + }, + { + "epoch": 0.6373122348354546, + "grad_norm": 0.23494644688036828, + "learning_rate": 0.00030709390000550543, + "loss": 2.427, + "step": 5558 + }, + { + "epoch": 0.6374269005847953, + "grad_norm": 0.2360840166240721, + "learning_rate": 0.00030692259506112207, + "loss": 2.562, + "step": 5559 + }, + { + "epoch": 0.637541566334136, + "grad_norm": 0.2561945452470874, + "learning_rate": 0.0003067513167480862, + "loss": 2.4625, + "step": 5560 + }, + { + "epoch": 0.6376562320834767, + "grad_norm": 0.2606740278367096, + "learning_rate": 0.00030658006509002265, + "loss": 2.3394, + "step": 5561 + }, + { + "epoch": 0.6377708978328174, + "grad_norm": 0.255192048020537, + "learning_rate": 0.0003064088401105521, + "loss": 2.551, + "step": 5562 + }, + { + "epoch": 0.637885563582158, + "grad_norm": 0.22265055027920913, + "learning_rate": 0.00030623764183329176, + "loss": 2.3916, + "step": 5563 + }, + { + "epoch": 0.6380002293314987, + "grad_norm": 0.2538054116207791, + "learning_rate": 0.0003060664702818555, + "loss": 2.4574, + "step": 5564 + }, + { + "epoch": 0.6381148950808394, + "grad_norm": 0.2381936565759509, + "learning_rate": 0.000305895325479853, + "loss": 2.4271, + "step": 5565 + }, + { + "epoch": 0.63822956083018, + "grad_norm": 0.2391724091021789, + "learning_rate": 0.00030572420745089024, + "loss": 2.3996, + "step": 5566 + }, + { + "epoch": 0.6383442265795207, + "grad_norm": 0.22596674411763887, + "learning_rate": 0.00030555311621856984, + "loss": 2.3073, + "step": 5567 + }, + { + "epoch": 0.6384588923288613, + "grad_norm": 0.23910245204790712, + "learning_rate": 0.0003053820518064905, + "loss": 2.4449, + "step": 5568 + }, + { + "epoch": 0.638573558078202, + "grad_norm": 0.24863720160124406, + "learning_rate": 0.00030521101423824736, + "loss": 2.3923, + "step": 5569 + }, + { + "epoch": 0.6386882238275428, + "grad_norm": 0.23848836686992558, + "learning_rate": 0.0003050400035374319, + "loss": 2.5183, + "step": 5570 + }, + { + "epoch": 0.6388028895768834, + "grad_norm": 0.23244563722155503, + "learning_rate": 0.0003048690197276318, + "loss": 2.3985, + "step": 5571 + }, + { + "epoch": 0.6389175553262241, + "grad_norm": 0.23980346381708229, + "learning_rate": 0.0003046980628324306, + "loss": 2.4689, + "step": 5572 + }, + { + "epoch": 0.6390322210755647, + "grad_norm": 0.24048800239971505, + "learning_rate": 0.0003045271328754089, + "loss": 2.4929, + "step": 5573 + }, + { + "epoch": 0.6391468868249054, + "grad_norm": 0.2833472758316603, + "learning_rate": 0.00030435622988014333, + "loss": 2.5184, + "step": 5574 + }, + { + "epoch": 0.6392615525742461, + "grad_norm": 0.24345676726661816, + "learning_rate": 0.0003041853538702064, + "loss": 2.3298, + "step": 5575 + }, + { + "epoch": 0.6393762183235867, + "grad_norm": 0.23737334691864767, + "learning_rate": 0.00030401450486916743, + "loss": 2.4799, + "step": 5576 + }, + { + "epoch": 0.6394908840729274, + "grad_norm": 0.2456564074696597, + "learning_rate": 0.00030384368290059175, + "loss": 2.6203, + "step": 5577 + }, + { + "epoch": 0.6396055498222681, + "grad_norm": 0.2433604148140225, + "learning_rate": 0.00030367288798804063, + "loss": 2.4013, + "step": 5578 + }, + { + "epoch": 0.6397202155716087, + "grad_norm": 0.23804416385711794, + "learning_rate": 0.0003035021201550725, + "loss": 2.4361, + "step": 5579 + }, + { + "epoch": 0.6398348813209495, + "grad_norm": 0.24675106347917813, + "learning_rate": 0.0003033313794252413, + "loss": 2.4898, + "step": 5580 + }, + { + "epoch": 0.6399495470702901, + "grad_norm": 0.2877622192447196, + "learning_rate": 0.00030316066582209745, + "loss": 2.5742, + "step": 5581 + }, + { + "epoch": 0.6400642128196308, + "grad_norm": 0.2690196002395355, + "learning_rate": 0.0003029899793691877, + "loss": 2.4379, + "step": 5582 + }, + { + "epoch": 0.6401788785689715, + "grad_norm": 0.24370696613004647, + "learning_rate": 0.00030281932009005474, + "loss": 2.3857, + "step": 5583 + }, + { + "epoch": 0.6402935443183121, + "grad_norm": 0.25075507329585384, + "learning_rate": 0.0003026486880082382, + "loss": 2.2878, + "step": 5584 + }, + { + "epoch": 0.6404082100676528, + "grad_norm": 0.2712703315860943, + "learning_rate": 0.00030247808314727315, + "loss": 2.4744, + "step": 5585 + }, + { + "epoch": 0.6405228758169934, + "grad_norm": 0.2433559738733202, + "learning_rate": 0.0003023075055306915, + "loss": 2.387, + "step": 5586 + }, + { + "epoch": 0.6406375415663341, + "grad_norm": 0.22891710661314116, + "learning_rate": 0.0003021369551820211, + "loss": 2.3227, + "step": 5587 + }, + { + "epoch": 0.6407522073156748, + "grad_norm": 0.25020110788364225, + "learning_rate": 0.0003019664321247859, + "loss": 2.5849, + "step": 5588 + }, + { + "epoch": 0.6408668730650154, + "grad_norm": 0.24850155323763826, + "learning_rate": 0.0003017959363825068, + "loss": 2.4952, + "step": 5589 + }, + { + "epoch": 0.6409815388143562, + "grad_norm": 0.25391151201367884, + "learning_rate": 0.00030162546797870014, + "loss": 2.4506, + "step": 5590 + }, + { + "epoch": 0.6410962045636969, + "grad_norm": 0.25703356684184364, + "learning_rate": 0.0003014550269368788, + "loss": 2.3004, + "step": 5591 + }, + { + "epoch": 0.6412108703130375, + "grad_norm": 0.2466140134033812, + "learning_rate": 0.0003012846132805519, + "loss": 2.4447, + "step": 5592 + }, + { + "epoch": 0.6413255360623782, + "grad_norm": 0.22938962171974508, + "learning_rate": 0.00030111422703322447, + "loss": 2.4067, + "step": 5593 + }, + { + "epoch": 0.6414402018117188, + "grad_norm": 0.2210941104555438, + "learning_rate": 0.00030094386821839846, + "loss": 2.3416, + "step": 5594 + }, + { + "epoch": 0.6415548675610595, + "grad_norm": 0.21476503437330766, + "learning_rate": 0.00030077353685957167, + "loss": 2.4579, + "step": 5595 + }, + { + "epoch": 0.6416695333104002, + "grad_norm": 0.2448900982724273, + "learning_rate": 0.0003006032329802378, + "loss": 2.4469, + "step": 5596 + }, + { + "epoch": 0.6417841990597408, + "grad_norm": 0.3042977985089514, + "learning_rate": 0.000300432956603887, + "loss": 2.4168, + "step": 5597 + }, + { + "epoch": 0.6418988648090815, + "grad_norm": 0.23066407889979565, + "learning_rate": 0.0003002627077540056, + "loss": 2.499, + "step": 5598 + }, + { + "epoch": 0.6420135305584223, + "grad_norm": 0.2546313088370773, + "learning_rate": 0.0003000924864540766, + "loss": 2.3875, + "step": 5599 + }, + { + "epoch": 0.6421281963077629, + "grad_norm": 0.22364361841055, + "learning_rate": 0.00029992229272757833, + "loss": 2.4302, + "step": 5600 + }, + { + "epoch": 0.6422428620571036, + "grad_norm": 0.255260045539605, + "learning_rate": 0.0002997521265979861, + "loss": 2.4009, + "step": 5601 + }, + { + "epoch": 0.6423575278064442, + "grad_norm": 0.27093319642463676, + "learning_rate": 0.0002995819880887709, + "loss": 2.3329, + "step": 5602 + }, + { + "epoch": 0.6424721935557849, + "grad_norm": 0.26454849215149356, + "learning_rate": 0.0002994118772233999, + "loss": 2.3926, + "step": 5603 + }, + { + "epoch": 0.6425868593051256, + "grad_norm": 0.2965449710769386, + "learning_rate": 0.0002992417940253371, + "loss": 2.3461, + "step": 5604 + }, + { + "epoch": 0.6427015250544662, + "grad_norm": 0.25930391110901724, + "learning_rate": 0.00029907173851804206, + "loss": 2.4265, + "step": 5605 + }, + { + "epoch": 0.6428161908038069, + "grad_norm": 0.29076442987596607, + "learning_rate": 0.00029890171072497054, + "loss": 2.477, + "step": 5606 + }, + { + "epoch": 0.6429308565531475, + "grad_norm": 0.2657461234756996, + "learning_rate": 0.0002987317106695748, + "loss": 2.4292, + "step": 5607 + }, + { + "epoch": 0.6430455223024882, + "grad_norm": 0.2762738606372087, + "learning_rate": 0.0002985617383753029, + "loss": 2.4491, + "step": 5608 + }, + { + "epoch": 0.643160188051829, + "grad_norm": 0.25762550425728215, + "learning_rate": 0.00029839179386559957, + "loss": 2.3368, + "step": 5609 + }, + { + "epoch": 0.6432748538011696, + "grad_norm": 0.23622645143233845, + "learning_rate": 0.00029822187716390525, + "loss": 2.367, + "step": 5610 + }, + { + "epoch": 0.6433895195505103, + "grad_norm": 0.25078313472020947, + "learning_rate": 0.0002980519882936568, + "loss": 2.5408, + "step": 5611 + }, + { + "epoch": 0.643504185299851, + "grad_norm": 0.2355015951503166, + "learning_rate": 0.000297882127278287, + "loss": 2.2902, + "step": 5612 + }, + { + "epoch": 0.6436188510491916, + "grad_norm": 0.20815988634004784, + "learning_rate": 0.00029771229414122494, + "loss": 2.4703, + "step": 5613 + }, + { + "epoch": 0.6437335167985323, + "grad_norm": 0.26240585088274326, + "learning_rate": 0.0002975424889058961, + "loss": 2.4254, + "step": 5614 + }, + { + "epoch": 0.6438481825478729, + "grad_norm": 0.23871563351535682, + "learning_rate": 0.00029737271159572176, + "loss": 2.43, + "step": 5615 + }, + { + "epoch": 0.6439628482972136, + "grad_norm": 0.23387369770608432, + "learning_rate": 0.0002972029622341193, + "loss": 2.3656, + "step": 5616 + }, + { + "epoch": 0.6440775140465543, + "grad_norm": 0.2763505678650222, + "learning_rate": 0.0002970332408445027, + "loss": 2.5593, + "step": 5617 + }, + { + "epoch": 0.6441921797958949, + "grad_norm": 0.2639255756398446, + "learning_rate": 0.0002968635474502813, + "loss": 2.4779, + "step": 5618 + }, + { + "epoch": 0.6443068455452357, + "grad_norm": 0.23528628447469446, + "learning_rate": 0.0002966938820748616, + "loss": 2.231, + "step": 5619 + }, + { + "epoch": 0.6444215112945763, + "grad_norm": 0.24616959166218327, + "learning_rate": 0.00029652424474164557, + "loss": 2.2724, + "step": 5620 + }, + { + "epoch": 0.644536177043917, + "grad_norm": 0.26708752812841563, + "learning_rate": 0.0002963546354740314, + "loss": 2.4256, + "step": 5621 + }, + { + "epoch": 0.6446508427932577, + "grad_norm": 0.26427323317525886, + "learning_rate": 0.0002961850542954133, + "loss": 2.3504, + "step": 5622 + }, + { + "epoch": 0.6447655085425983, + "grad_norm": 0.26998655568855934, + "learning_rate": 0.0002960155012291818, + "loss": 2.5164, + "step": 5623 + }, + { + "epoch": 0.644880174291939, + "grad_norm": 0.250084754133724, + "learning_rate": 0.0002958459762987238, + "loss": 2.4739, + "step": 5624 + }, + { + "epoch": 0.6449948400412797, + "grad_norm": 0.28786685738260465, + "learning_rate": 0.00029567647952742176, + "loss": 2.3899, + "step": 5625 + }, + { + "epoch": 0.6451095057906203, + "grad_norm": 0.26916469411413324, + "learning_rate": 0.00029550701093865474, + "loss": 2.428, + "step": 5626 + }, + { + "epoch": 0.645224171539961, + "grad_norm": 0.274447756838796, + "learning_rate": 0.0002953375705557975, + "loss": 2.4073, + "step": 5627 + }, + { + "epoch": 0.6453388372893016, + "grad_norm": 0.27906643516140783, + "learning_rate": 0.00029516815840222103, + "loss": 2.4642, + "step": 5628 + }, + { + "epoch": 0.6454535030386424, + "grad_norm": 0.23396250522532427, + "learning_rate": 0.0002949987745012928, + "loss": 2.3474, + "step": 5629 + }, + { + "epoch": 0.6455681687879831, + "grad_norm": 0.2557116841351349, + "learning_rate": 0.00029482941887637595, + "loss": 2.4611, + "step": 5630 + }, + { + "epoch": 0.6456828345373237, + "grad_norm": 0.23600051243122336, + "learning_rate": 0.00029466009155082977, + "loss": 2.4435, + "step": 5631 + }, + { + "epoch": 0.6457975002866644, + "grad_norm": 0.24745280890026364, + "learning_rate": 0.00029449079254801, + "loss": 2.3555, + "step": 5632 + }, + { + "epoch": 0.6459121660360051, + "grad_norm": 0.26967046403946093, + "learning_rate": 0.00029432152189126784, + "loss": 2.3924, + "step": 5633 + }, + { + "epoch": 0.6460268317853457, + "grad_norm": 0.2582628980879432, + "learning_rate": 0.00029415227960395126, + "loss": 2.4175, + "step": 5634 + }, + { + "epoch": 0.6461414975346864, + "grad_norm": 0.2667329007189647, + "learning_rate": 0.0002939830657094038, + "loss": 2.4359, + "step": 5635 + }, + { + "epoch": 0.646256163284027, + "grad_norm": 0.2584403054067718, + "learning_rate": 0.00029381388023096556, + "loss": 2.4777, + "step": 5636 + }, + { + "epoch": 0.6463708290333677, + "grad_norm": 0.2448074439887825, + "learning_rate": 0.0002936447231919721, + "loss": 2.4026, + "step": 5637 + }, + { + "epoch": 0.6464854947827084, + "grad_norm": 0.23506794710498322, + "learning_rate": 0.0002934755946157556, + "loss": 2.5527, + "step": 5638 + }, + { + "epoch": 0.646600160532049, + "grad_norm": 0.229589813860594, + "learning_rate": 0.0002933064945256442, + "loss": 2.3899, + "step": 5639 + }, + { + "epoch": 0.6467148262813898, + "grad_norm": 0.22711133093913352, + "learning_rate": 0.00029313742294496194, + "loss": 2.2789, + "step": 5640 + }, + { + "epoch": 0.6468294920307304, + "grad_norm": 0.24589402616403436, + "learning_rate": 0.00029296837989702893, + "loss": 2.4373, + "step": 5641 + }, + { + "epoch": 0.6469441577800711, + "grad_norm": 0.23318551826503237, + "learning_rate": 0.0002927993654051617, + "loss": 2.3886, + "step": 5642 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.23603384018727594, + "learning_rate": 0.0002926303794926722, + "loss": 2.5977, + "step": 5643 + }, + { + "epoch": 0.6471734892787524, + "grad_norm": 0.23520160798590653, + "learning_rate": 0.0002924614221828691, + "loss": 2.3885, + "step": 5644 + }, + { + "epoch": 0.6472881550280931, + "grad_norm": 0.23387142584112652, + "learning_rate": 0.0002922924934990568, + "loss": 2.4888, + "step": 5645 + }, + { + "epoch": 0.6474028207774338, + "grad_norm": 0.2348625852895707, + "learning_rate": 0.00029212359346453585, + "loss": 2.3976, + "step": 5646 + }, + { + "epoch": 0.6475174865267744, + "grad_norm": 0.24575923868493688, + "learning_rate": 0.00029195472210260257, + "loss": 2.4553, + "step": 5647 + }, + { + "epoch": 0.6476321522761151, + "grad_norm": 0.2529606429639754, + "learning_rate": 0.00029178587943654965, + "loss": 2.4127, + "step": 5648 + }, + { + "epoch": 0.6477468180254558, + "grad_norm": 0.25988253427881813, + "learning_rate": 0.00029161706548966576, + "loss": 2.2999, + "step": 5649 + }, + { + "epoch": 0.6478614837747965, + "grad_norm": 0.24219340207708312, + "learning_rate": 0.0002914482802852356, + "loss": 2.4196, + "step": 5650 + }, + { + "epoch": 0.6479761495241372, + "grad_norm": 0.2696814415673247, + "learning_rate": 0.0002912795238465399, + "loss": 2.4996, + "step": 5651 + }, + { + "epoch": 0.6480908152734778, + "grad_norm": 0.2541434058301903, + "learning_rate": 0.0002911107961968552, + "loss": 2.3113, + "step": 5652 + }, + { + "epoch": 0.6482054810228185, + "grad_norm": 0.2516095556348608, + "learning_rate": 0.0002909420973594541, + "loss": 2.5481, + "step": 5653 + }, + { + "epoch": 0.6483201467721592, + "grad_norm": 0.25464964305242477, + "learning_rate": 0.00029077342735760615, + "loss": 2.5334, + "step": 5654 + }, + { + "epoch": 0.6484348125214998, + "grad_norm": 0.27157018764617435, + "learning_rate": 0.0002906047862145754, + "loss": 2.394, + "step": 5655 + }, + { + "epoch": 0.6485494782708405, + "grad_norm": 0.29350822765479045, + "learning_rate": 0.00029043617395362297, + "loss": 2.5096, + "step": 5656 + }, + { + "epoch": 0.6486641440201811, + "grad_norm": 0.23958297168633486, + "learning_rate": 0.00029026759059800597, + "loss": 2.5002, + "step": 5657 + }, + { + "epoch": 0.6487788097695218, + "grad_norm": 0.25212140529203575, + "learning_rate": 0.00029009903617097647, + "loss": 2.4031, + "step": 5658 + }, + { + "epoch": 0.6488934755188626, + "grad_norm": 0.2801407147918183, + "learning_rate": 0.00028993051069578415, + "loss": 2.4639, + "step": 5659 + }, + { + "epoch": 0.6490081412682032, + "grad_norm": 0.26439529783461496, + "learning_rate": 0.0002897620141956737, + "loss": 2.5516, + "step": 5660 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 0.23585350633934601, + "learning_rate": 0.00028959354669388584, + "loss": 2.4125, + "step": 5661 + }, + { + "epoch": 0.6492374727668845, + "grad_norm": 0.22659632561400014, + "learning_rate": 0.0002894251082136574, + "loss": 2.2204, + "step": 5662 + }, + { + "epoch": 0.6493521385162252, + "grad_norm": 0.24286476485274064, + "learning_rate": 0.0002892566987782213, + "loss": 2.2555, + "step": 5663 + }, + { + "epoch": 0.6494668042655659, + "grad_norm": 0.24946976485492503, + "learning_rate": 0.0002890883184108065, + "loss": 2.4342, + "step": 5664 + }, + { + "epoch": 0.6495814700149065, + "grad_norm": 0.24937815568346353, + "learning_rate": 0.0002889199671346379, + "loss": 2.4295, + "step": 5665 + }, + { + "epoch": 0.6496961357642472, + "grad_norm": 0.27192849803477626, + "learning_rate": 0.0002887516449729365, + "loss": 2.4252, + "step": 5666 + }, + { + "epoch": 0.6498108015135879, + "grad_norm": 0.22347661255198248, + "learning_rate": 0.0002885833519489186, + "loss": 2.4658, + "step": 5667 + }, + { + "epoch": 0.6499254672629285, + "grad_norm": 0.23576867180901842, + "learning_rate": 0.0002884150880857972, + "loss": 2.4509, + "step": 5668 + }, + { + "epoch": 0.6500401330122693, + "grad_norm": 0.23434289413172396, + "learning_rate": 0.00028824685340678163, + "loss": 2.377, + "step": 5669 + }, + { + "epoch": 0.6501547987616099, + "grad_norm": 0.23397172846403394, + "learning_rate": 0.000288078647935076, + "loss": 2.3541, + "step": 5670 + }, + { + "epoch": 0.6502694645109506, + "grad_norm": 0.22515335259006614, + "learning_rate": 0.0002879104716938814, + "loss": 2.2812, + "step": 5671 + }, + { + "epoch": 0.6503841302602913, + "grad_norm": 0.2538860788105143, + "learning_rate": 0.00028774232470639454, + "loss": 2.5628, + "step": 5672 + }, + { + "epoch": 0.6504987960096319, + "grad_norm": 0.2300478455823693, + "learning_rate": 0.0002875742069958076, + "loss": 2.3377, + "step": 5673 + }, + { + "epoch": 0.6506134617589726, + "grad_norm": 0.2605896453240586, + "learning_rate": 0.00028740611858530984, + "loss": 2.3875, + "step": 5674 + }, + { + "epoch": 0.6507281275083132, + "grad_norm": 0.24941822668389896, + "learning_rate": 0.0002872380594980858, + "loss": 2.4525, + "step": 5675 + }, + { + "epoch": 0.6508427932576539, + "grad_norm": 0.2790859475463254, + "learning_rate": 0.00028707002975731564, + "loss": 2.3956, + "step": 5676 + }, + { + "epoch": 0.6509574590069946, + "grad_norm": 0.2791239206128264, + "learning_rate": 0.00028690202938617607, + "loss": 2.5136, + "step": 5677 + }, + { + "epoch": 0.6510721247563352, + "grad_norm": 0.22798766210742447, + "learning_rate": 0.0002867340584078395, + "loss": 2.2897, + "step": 5678 + }, + { + "epoch": 0.651186790505676, + "grad_norm": 0.2512337476378715, + "learning_rate": 0.0002865661168454744, + "loss": 2.3616, + "step": 5679 + }, + { + "epoch": 0.6513014562550167, + "grad_norm": 0.24514082837191367, + "learning_rate": 0.000286398204722245, + "loss": 2.4086, + "step": 5680 + }, + { + "epoch": 0.6514161220043573, + "grad_norm": 0.2632389911596468, + "learning_rate": 0.0002862303220613118, + "loss": 2.3422, + "step": 5681 + }, + { + "epoch": 0.651530787753698, + "grad_norm": 0.2534026760685791, + "learning_rate": 0.0002860624688858308, + "loss": 2.4974, + "step": 5682 + }, + { + "epoch": 0.6516454535030386, + "grad_norm": 0.2596882211351948, + "learning_rate": 0.00028589464521895414, + "loss": 2.4496, + "step": 5683 + }, + { + "epoch": 0.6517601192523793, + "grad_norm": 0.26204431894852764, + "learning_rate": 0.0002857268510838299, + "loss": 2.4679, + "step": 5684 + }, + { + "epoch": 0.65187478500172, + "grad_norm": 0.24101427205039994, + "learning_rate": 0.0002855590865036022, + "loss": 2.4391, + "step": 5685 + }, + { + "epoch": 0.6519894507510606, + "grad_norm": 0.24282564442262652, + "learning_rate": 0.00028539135150141084, + "loss": 2.4821, + "step": 5686 + }, + { + "epoch": 0.6521041165004013, + "grad_norm": 0.25533332270224457, + "learning_rate": 0.0002852236461003919, + "loss": 2.598, + "step": 5687 + }, + { + "epoch": 0.6522187822497421, + "grad_norm": 0.25363442918670664, + "learning_rate": 0.00028505597032367665, + "loss": 2.4296, + "step": 5688 + }, + { + "epoch": 0.6523334479990827, + "grad_norm": 0.2824175622123652, + "learning_rate": 0.00028488832419439346, + "loss": 2.6137, + "step": 5689 + }, + { + "epoch": 0.6524481137484234, + "grad_norm": 0.2637758170660184, + "learning_rate": 0.0002847207077356654, + "loss": 2.4239, + "step": 5690 + }, + { + "epoch": 0.652562779497764, + "grad_norm": 0.24076632651416974, + "learning_rate": 0.00028455312097061205, + "loss": 2.4514, + "step": 5691 + }, + { + "epoch": 0.6526774452471047, + "grad_norm": 0.22477164297578944, + "learning_rate": 0.0002843855639223488, + "loss": 2.4554, + "step": 5692 + }, + { + "epoch": 0.6527921109964454, + "grad_norm": 0.2652591228536832, + "learning_rate": 0.00028421803661398716, + "loss": 2.3119, + "step": 5693 + }, + { + "epoch": 0.652906776745786, + "grad_norm": 0.251732929770671, + "learning_rate": 0.00028405053906863407, + "loss": 2.576, + "step": 5694 + }, + { + "epoch": 0.6530214424951267, + "grad_norm": 0.2567026618906436, + "learning_rate": 0.00028388307130939303, + "loss": 2.4603, + "step": 5695 + }, + { + "epoch": 0.6531361082444673, + "grad_norm": 0.22930826067858937, + "learning_rate": 0.0002837156333593625, + "loss": 2.2955, + "step": 5696 + }, + { + "epoch": 0.653250773993808, + "grad_norm": 0.2585253162135586, + "learning_rate": 0.0002835482252416376, + "loss": 2.4466, + "step": 5697 + }, + { + "epoch": 0.6533654397431488, + "grad_norm": 0.23976899563172563, + "learning_rate": 0.00028338084697930913, + "loss": 2.4875, + "step": 5698 + }, + { + "epoch": 0.6534801054924894, + "grad_norm": 0.2665870704519859, + "learning_rate": 0.0002832134985954636, + "loss": 2.4702, + "step": 5699 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.2629597209701206, + "learning_rate": 0.0002830461801131837, + "loss": 2.3997, + "step": 5700 + }, + { + "epoch": 0.6537094369911708, + "grad_norm": 0.25377673367996123, + "learning_rate": 0.0002828788915555479, + "loss": 2.4565, + "step": 5701 + }, + { + "epoch": 0.6538241027405114, + "grad_norm": 0.24075963857151567, + "learning_rate": 0.0002827116329456301, + "loss": 2.3954, + "step": 5702 + }, + { + "epoch": 0.6539387684898521, + "grad_norm": 0.2833079796240083, + "learning_rate": 0.0002825444043065004, + "loss": 2.4201, + "step": 5703 + }, + { + "epoch": 0.6540534342391927, + "grad_norm": 0.24875200053756577, + "learning_rate": 0.0002823772056612255, + "loss": 2.5976, + "step": 5704 + }, + { + "epoch": 0.6541680999885334, + "grad_norm": 0.22378414377365588, + "learning_rate": 0.00028221003703286665, + "loss": 2.396, + "step": 5705 + }, + { + "epoch": 0.6542827657378741, + "grad_norm": 0.2709371367628395, + "learning_rate": 0.0002820428984444816, + "loss": 2.2661, + "step": 5706 + }, + { + "epoch": 0.6543974314872147, + "grad_norm": 0.2757182974005147, + "learning_rate": 0.00028187578991912437, + "loss": 2.3632, + "step": 5707 + }, + { + "epoch": 0.6545120972365555, + "grad_norm": 0.23508535568675532, + "learning_rate": 0.00028170871147984366, + "loss": 2.3696, + "step": 5708 + }, + { + "epoch": 0.6546267629858961, + "grad_norm": 0.24594163908158714, + "learning_rate": 0.00028154166314968545, + "loss": 2.3751, + "step": 5709 + }, + { + "epoch": 0.6547414287352368, + "grad_norm": 0.25833428537587416, + "learning_rate": 0.0002813746449516907, + "loss": 2.3903, + "step": 5710 + }, + { + "epoch": 0.6548560944845775, + "grad_norm": 0.27484711465138284, + "learning_rate": 0.0002812076569088962, + "loss": 2.322, + "step": 5711 + }, + { + "epoch": 0.6549707602339181, + "grad_norm": 0.2322907765790648, + "learning_rate": 0.0002810406990443348, + "loss": 2.5325, + "step": 5712 + }, + { + "epoch": 0.6550854259832588, + "grad_norm": 0.24776938688191574, + "learning_rate": 0.00028087377138103533, + "loss": 2.3297, + "step": 5713 + }, + { + "epoch": 0.6552000917325995, + "grad_norm": 0.23158261012027856, + "learning_rate": 0.0002807068739420221, + "loss": 2.4271, + "step": 5714 + }, + { + "epoch": 0.6553147574819401, + "grad_norm": 0.23479974406412335, + "learning_rate": 0.0002805400067503155, + "loss": 2.5268, + "step": 5715 + }, + { + "epoch": 0.6554294232312808, + "grad_norm": 0.2507778250433927, + "learning_rate": 0.0002803731698289319, + "loss": 2.3899, + "step": 5716 + }, + { + "epoch": 0.6555440889806214, + "grad_norm": 0.2583038963802231, + "learning_rate": 0.00028020636320088286, + "loss": 2.4044, + "step": 5717 + }, + { + "epoch": 0.6556587547299622, + "grad_norm": 0.2648132659645557, + "learning_rate": 0.00028003958688917614, + "loss": 2.3401, + "step": 5718 + }, + { + "epoch": 0.6557734204793029, + "grad_norm": 0.23900663822637008, + "learning_rate": 0.00027987284091681596, + "loss": 2.4322, + "step": 5719 + }, + { + "epoch": 0.6558880862286435, + "grad_norm": 0.23544551274215067, + "learning_rate": 0.0002797061253068012, + "loss": 2.2931, + "step": 5720 + }, + { + "epoch": 0.6560027519779842, + "grad_norm": 0.253469442142768, + "learning_rate": 0.0002795394400821273, + "loss": 2.3632, + "step": 5721 + }, + { + "epoch": 0.6561174177273249, + "grad_norm": 0.24782072259899357, + "learning_rate": 0.00027937278526578546, + "loss": 2.3532, + "step": 5722 + }, + { + "epoch": 0.6562320834766655, + "grad_norm": 0.2430005383429055, + "learning_rate": 0.0002792061608807619, + "loss": 2.3572, + "step": 5723 + }, + { + "epoch": 0.6563467492260062, + "grad_norm": 0.25354276627142075, + "learning_rate": 0.00027903956695004, + "loss": 2.4653, + "step": 5724 + }, + { + "epoch": 0.6564614149753468, + "grad_norm": 0.2557591005337568, + "learning_rate": 0.00027887300349659815, + "loss": 2.3297, + "step": 5725 + }, + { + "epoch": 0.6565760807246875, + "grad_norm": 0.26731713672166413, + "learning_rate": 0.00027870647054341016, + "loss": 2.3034, + "step": 5726 + }, + { + "epoch": 0.6566907464740283, + "grad_norm": 0.24931412951962512, + "learning_rate": 0.0002785399681134464, + "loss": 2.3791, + "step": 5727 + }, + { + "epoch": 0.6568054122233689, + "grad_norm": 0.25883674133520823, + "learning_rate": 0.0002783734962296726, + "loss": 2.442, + "step": 5728 + }, + { + "epoch": 0.6569200779727096, + "grad_norm": 0.25334229322011337, + "learning_rate": 0.00027820705491505025, + "loss": 2.5225, + "step": 5729 + }, + { + "epoch": 0.6570347437220502, + "grad_norm": 0.25985002381941585, + "learning_rate": 0.0002780406441925371, + "loss": 2.2936, + "step": 5730 + }, + { + "epoch": 0.6571494094713909, + "grad_norm": 0.25005410924447674, + "learning_rate": 0.0002778742640850863, + "loss": 2.3953, + "step": 5731 + }, + { + "epoch": 0.6572640752207316, + "grad_norm": 0.2748524723040095, + "learning_rate": 0.0002777079146156465, + "loss": 2.3931, + "step": 5732 + }, + { + "epoch": 0.6573787409700722, + "grad_norm": 0.25164991841180934, + "learning_rate": 0.0002775415958071625, + "loss": 2.4199, + "step": 5733 + }, + { + "epoch": 0.6574934067194129, + "grad_norm": 0.2307592088245544, + "learning_rate": 0.000277375307682575, + "loss": 2.5075, + "step": 5734 + }, + { + "epoch": 0.6576080724687536, + "grad_norm": 0.23280048460380875, + "learning_rate": 0.00027720905026482024, + "loss": 2.3895, + "step": 5735 + }, + { + "epoch": 0.6577227382180942, + "grad_norm": 0.26153464164864837, + "learning_rate": 0.00027704282357683013, + "loss": 2.4235, + "step": 5736 + }, + { + "epoch": 0.657837403967435, + "grad_norm": 0.24546149653701257, + "learning_rate": 0.0002768766276415328, + "loss": 2.353, + "step": 5737 + }, + { + "epoch": 0.6579520697167756, + "grad_norm": 0.2552156430418072, + "learning_rate": 0.00027671046248185115, + "loss": 2.3778, + "step": 5738 + }, + { + "epoch": 0.6580667354661163, + "grad_norm": 0.22784909879818593, + "learning_rate": 0.0002765443281207053, + "loss": 2.5407, + "step": 5739 + }, + { + "epoch": 0.658181401215457, + "grad_norm": 0.2380033394835606, + "learning_rate": 0.00027637822458100974, + "loss": 2.337, + "step": 5740 + }, + { + "epoch": 0.6582960669647976, + "grad_norm": 0.24508608122322872, + "learning_rate": 0.0002762121518856755, + "loss": 2.5339, + "step": 5741 + }, + { + "epoch": 0.6584107327141383, + "grad_norm": 0.23613795002851157, + "learning_rate": 0.00027604611005760914, + "loss": 2.3751, + "step": 5742 + }, + { + "epoch": 0.6585253984634789, + "grad_norm": 0.22844835784567327, + "learning_rate": 0.0002758800991197129, + "loss": 2.4891, + "step": 5743 + }, + { + "epoch": 0.6586400642128196, + "grad_norm": 0.2425499749283864, + "learning_rate": 0.00027571411909488487, + "loss": 2.3546, + "step": 5744 + }, + { + "epoch": 0.6587547299621603, + "grad_norm": 0.23198365567729787, + "learning_rate": 0.00027554817000601905, + "loss": 2.5144, + "step": 5745 + }, + { + "epoch": 0.6588693957115009, + "grad_norm": 0.23813006346784443, + "learning_rate": 0.00027538225187600455, + "loss": 2.4779, + "step": 5746 + }, + { + "epoch": 0.6589840614608417, + "grad_norm": 0.2532566653616931, + "learning_rate": 0.0002752163647277268, + "loss": 2.4193, + "step": 5747 + }, + { + "epoch": 0.6590987272101824, + "grad_norm": 0.25236499765016984, + "learning_rate": 0.0002750505085840668, + "loss": 2.5391, + "step": 5748 + }, + { + "epoch": 0.659213392959523, + "grad_norm": 0.2637220533699274, + "learning_rate": 0.0002748846834679012, + "loss": 2.3598, + "step": 5749 + }, + { + "epoch": 0.6593280587088637, + "grad_norm": 0.24911036304341752, + "learning_rate": 0.0002747188894021024, + "loss": 2.4712, + "step": 5750 + }, + { + "epoch": 0.6594427244582043, + "grad_norm": 0.28459948709122834, + "learning_rate": 0.00027455312640953873, + "loss": 2.3872, + "step": 5751 + }, + { + "epoch": 0.659557390207545, + "grad_norm": 0.2909386573438485, + "learning_rate": 0.0002743873945130737, + "loss": 2.4084, + "step": 5752 + }, + { + "epoch": 0.6596720559568857, + "grad_norm": 0.242725157572406, + "learning_rate": 0.0002742216937355668, + "loss": 2.5349, + "step": 5753 + }, + { + "epoch": 0.6597867217062263, + "grad_norm": 0.23820125358539773, + "learning_rate": 0.00027405602409987396, + "loss": 2.509, + "step": 5754 + }, + { + "epoch": 0.659901387455567, + "grad_norm": 0.24448818374823061, + "learning_rate": 0.0002738903856288455, + "loss": 2.3564, + "step": 5755 + }, + { + "epoch": 0.6600160532049077, + "grad_norm": 0.27903598930521567, + "learning_rate": 0.0002737247783453283, + "loss": 2.3885, + "step": 5756 + }, + { + "epoch": 0.6601307189542484, + "grad_norm": 0.2315742638326828, + "learning_rate": 0.00027355920227216493, + "loss": 2.3174, + "step": 5757 + }, + { + "epoch": 0.6602453847035891, + "grad_norm": 0.25829538078419223, + "learning_rate": 0.00027339365743219286, + "loss": 2.3934, + "step": 5758 + }, + { + "epoch": 0.6603600504529297, + "grad_norm": 0.26011231980751, + "learning_rate": 0.00027322814384824645, + "loss": 2.448, + "step": 5759 + }, + { + "epoch": 0.6604747162022704, + "grad_norm": 0.25564062319716885, + "learning_rate": 0.0002730626615431551, + "loss": 2.4743, + "step": 5760 + }, + { + "epoch": 0.6605893819516111, + "grad_norm": 0.2258519372107094, + "learning_rate": 0.0002728972105397436, + "loss": 2.4715, + "step": 5761 + }, + { + "epoch": 0.6607040477009517, + "grad_norm": 0.2474470814769647, + "learning_rate": 0.0002727317908608329, + "loss": 2.4973, + "step": 5762 + }, + { + "epoch": 0.6608187134502924, + "grad_norm": 0.2521469502411131, + "learning_rate": 0.0002725664025292395, + "loss": 2.4968, + "step": 5763 + }, + { + "epoch": 0.660933379199633, + "grad_norm": 0.22188981807127156, + "learning_rate": 0.00027240104556777565, + "loss": 2.4193, + "step": 5764 + }, + { + "epoch": 0.6610480449489737, + "grad_norm": 0.26844795854860976, + "learning_rate": 0.00027223571999924914, + "loss": 2.4611, + "step": 5765 + }, + { + "epoch": 0.6611627106983144, + "grad_norm": 0.2615122837564423, + "learning_rate": 0.00027207042584646363, + "loss": 2.4015, + "step": 5766 + }, + { + "epoch": 0.661277376447655, + "grad_norm": 0.24321498550275653, + "learning_rate": 0.000271905163132218, + "loss": 2.3518, + "step": 5767 + }, + { + "epoch": 0.6613920421969958, + "grad_norm": 0.24328017601829807, + "learning_rate": 0.00027173993187930696, + "loss": 2.4403, + "step": 5768 + }, + { + "epoch": 0.6615067079463365, + "grad_norm": 0.26482781169651587, + "learning_rate": 0.00027157473211052174, + "loss": 2.4241, + "step": 5769 + }, + { + "epoch": 0.6616213736956771, + "grad_norm": 0.24617465758440277, + "learning_rate": 0.0002714095638486478, + "loss": 2.4408, + "step": 5770 + }, + { + "epoch": 0.6617360394450178, + "grad_norm": 0.25793754815303105, + "learning_rate": 0.0002712444271164672, + "loss": 2.4667, + "step": 5771 + }, + { + "epoch": 0.6618507051943584, + "grad_norm": 0.2666174470617704, + "learning_rate": 0.00027107932193675766, + "loss": 2.3816, + "step": 5772 + }, + { + "epoch": 0.6619653709436991, + "grad_norm": 0.2600776433261486, + "learning_rate": 0.00027091424833229155, + "loss": 2.3813, + "step": 5773 + }, + { + "epoch": 0.6620800366930398, + "grad_norm": 0.26017058961566003, + "learning_rate": 0.0002707492063258384, + "loss": 2.4452, + "step": 5774 + }, + { + "epoch": 0.6621947024423804, + "grad_norm": 0.24664364353920692, + "learning_rate": 0.00027058419594016246, + "loss": 2.5027, + "step": 5775 + }, + { + "epoch": 0.6623093681917211, + "grad_norm": 0.2768164244078615, + "learning_rate": 0.00027041921719802344, + "loss": 2.4844, + "step": 5776 + }, + { + "epoch": 0.6624240339410618, + "grad_norm": 0.2611758363815006, + "learning_rate": 0.00027025427012217717, + "loss": 2.3916, + "step": 5777 + }, + { + "epoch": 0.6625386996904025, + "grad_norm": 0.26198707558714446, + "learning_rate": 0.00027008935473537497, + "loss": 2.535, + "step": 5778 + }, + { + "epoch": 0.6626533654397432, + "grad_norm": 0.24101031711537704, + "learning_rate": 0.0002699244710603639, + "loss": 2.33, + "step": 5779 + }, + { + "epoch": 0.6627680311890838, + "grad_norm": 0.26695295253175233, + "learning_rate": 0.00026975961911988633, + "loss": 2.3467, + "step": 5780 + }, + { + "epoch": 0.6628826969384245, + "grad_norm": 0.2523948482778285, + "learning_rate": 0.0002695947989366807, + "loss": 2.5101, + "step": 5781 + }, + { + "epoch": 0.6629973626877652, + "grad_norm": 0.24880321366928507, + "learning_rate": 0.0002694300105334805, + "loss": 2.5354, + "step": 5782 + }, + { + "epoch": 0.6631120284371058, + "grad_norm": 0.2586214705167903, + "learning_rate": 0.00026926525393301527, + "loss": 2.3773, + "step": 5783 + }, + { + "epoch": 0.6632266941864465, + "grad_norm": 0.2260697621130077, + "learning_rate": 0.00026910052915801003, + "loss": 2.4925, + "step": 5784 + }, + { + "epoch": 0.6633413599357871, + "grad_norm": 0.22549051760704042, + "learning_rate": 0.00026893583623118547, + "loss": 2.4412, + "step": 5785 + }, + { + "epoch": 0.6634560256851278, + "grad_norm": 0.25216009994666233, + "learning_rate": 0.00026877117517525777, + "loss": 2.5138, + "step": 5786 + }, + { + "epoch": 0.6635706914344686, + "grad_norm": 0.2289944579339804, + "learning_rate": 0.0002686065460129391, + "loss": 2.442, + "step": 5787 + }, + { + "epoch": 0.6636853571838092, + "grad_norm": 0.2268511314711116, + "learning_rate": 0.0002684419487669362, + "loss": 2.2968, + "step": 5788 + }, + { + "epoch": 0.6638000229331499, + "grad_norm": 0.2517235733942, + "learning_rate": 0.000268277383459953, + "loss": 2.4164, + "step": 5789 + }, + { + "epoch": 0.6639146886824906, + "grad_norm": 0.24532212714591592, + "learning_rate": 0.00026811285011468745, + "loss": 2.6298, + "step": 5790 + }, + { + "epoch": 0.6640293544318312, + "grad_norm": 0.2511226033114938, + "learning_rate": 0.00026794834875383414, + "loss": 2.2943, + "step": 5791 + }, + { + "epoch": 0.6641440201811719, + "grad_norm": 0.2296211956024753, + "learning_rate": 0.00026778387940008276, + "loss": 2.3497, + "step": 5792 + }, + { + "epoch": 0.6642586859305125, + "grad_norm": 0.2579091474860311, + "learning_rate": 0.0002676194420761188, + "loss": 2.5644, + "step": 5793 + }, + { + "epoch": 0.6643733516798532, + "grad_norm": 0.2845383516106061, + "learning_rate": 0.00026745503680462325, + "loss": 2.3564, + "step": 5794 + }, + { + "epoch": 0.664488017429194, + "grad_norm": 0.23801234657020823, + "learning_rate": 0.0002672906636082728, + "loss": 2.4536, + "step": 5795 + }, + { + "epoch": 0.6646026831785345, + "grad_norm": 0.26748769531200295, + "learning_rate": 0.0002671263225097393, + "loss": 2.4308, + "step": 5796 + }, + { + "epoch": 0.6647173489278753, + "grad_norm": 0.231109445136059, + "learning_rate": 0.0002669620135316906, + "loss": 2.3721, + "step": 5797 + }, + { + "epoch": 0.6648320146772159, + "grad_norm": 0.27985227682006536, + "learning_rate": 0.0002667977366967901, + "loss": 2.386, + "step": 5798 + }, + { + "epoch": 0.6649466804265566, + "grad_norm": 0.25301374917522906, + "learning_rate": 0.0002666334920276965, + "loss": 2.5929, + "step": 5799 + }, + { + "epoch": 0.6650613461758973, + "grad_norm": 0.2481723864450886, + "learning_rate": 0.00026646927954706434, + "loss": 2.4493, + "step": 5800 + }, + { + "epoch": 0.6651760119252379, + "grad_norm": 0.2571158282055341, + "learning_rate": 0.00026630509927754375, + "loss": 2.466, + "step": 5801 + }, + { + "epoch": 0.6652906776745786, + "grad_norm": 0.2579527850583459, + "learning_rate": 0.00026614095124177995, + "loss": 2.4284, + "step": 5802 + }, + { + "epoch": 0.6654053434239193, + "grad_norm": 0.2622022799194773, + "learning_rate": 0.0002659768354624139, + "loss": 2.4838, + "step": 5803 + }, + { + "epoch": 0.6655200091732599, + "grad_norm": 0.2568510598247887, + "learning_rate": 0.0002658127519620829, + "loss": 2.5334, + "step": 5804 + }, + { + "epoch": 0.6656346749226006, + "grad_norm": 0.2514932722055232, + "learning_rate": 0.00026564870076341865, + "loss": 2.3849, + "step": 5805 + }, + { + "epoch": 0.6657493406719412, + "grad_norm": 0.25904607420170395, + "learning_rate": 0.0002654846818890489, + "loss": 2.4995, + "step": 5806 + }, + { + "epoch": 0.665864006421282, + "grad_norm": 0.25564826804265495, + "learning_rate": 0.0002653206953615972, + "loss": 2.371, + "step": 5807 + }, + { + "epoch": 0.6659786721706227, + "grad_norm": 0.24061487582797872, + "learning_rate": 0.0002651567412036818, + "loss": 2.3945, + "step": 5808 + }, + { + "epoch": 0.6660933379199633, + "grad_norm": 0.24164705421958846, + "learning_rate": 0.0002649928194379177, + "loss": 2.4871, + "step": 5809 + }, + { + "epoch": 0.666208003669304, + "grad_norm": 0.24557890888663322, + "learning_rate": 0.00026482893008691465, + "loss": 2.5372, + "step": 5810 + }, + { + "epoch": 0.6663226694186446, + "grad_norm": 0.25386631532968834, + "learning_rate": 0.0002646650731732777, + "loss": 2.3595, + "step": 5811 + }, + { + "epoch": 0.6664373351679853, + "grad_norm": 0.24488325798568564, + "learning_rate": 0.00026450124871960805, + "loss": 2.4271, + "step": 5812 + }, + { + "epoch": 0.666552000917326, + "grad_norm": 0.2429752532187373, + "learning_rate": 0.0002643374567485022, + "loss": 2.3799, + "step": 5813 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.2546202728494158, + "learning_rate": 0.00026417369728255194, + "loss": 2.4456, + "step": 5814 + }, + { + "epoch": 0.6667813324160073, + "grad_norm": 0.26078461944946285, + "learning_rate": 0.0002640099703443449, + "loss": 2.4225, + "step": 5815 + }, + { + "epoch": 0.6668959981653481, + "grad_norm": 0.23396062942653115, + "learning_rate": 0.00026384627595646426, + "loss": 2.3843, + "step": 5816 + }, + { + "epoch": 0.6670106639146887, + "grad_norm": 0.27587231849473237, + "learning_rate": 0.0002636826141414882, + "loss": 2.4722, + "step": 5817 + }, + { + "epoch": 0.6671253296640294, + "grad_norm": 0.23880852441174227, + "learning_rate": 0.0002635189849219907, + "loss": 2.2687, + "step": 5818 + }, + { + "epoch": 0.66723999541337, + "grad_norm": 0.2385612950245045, + "learning_rate": 0.0002633553883205419, + "loss": 2.4924, + "step": 5819 + }, + { + "epoch": 0.6673546611627107, + "grad_norm": 0.2257002183425713, + "learning_rate": 0.0002631918243597062, + "loss": 2.4018, + "step": 5820 + }, + { + "epoch": 0.6674693269120514, + "grad_norm": 0.24560813459014622, + "learning_rate": 0.00026302829306204436, + "loss": 2.3393, + "step": 5821 + }, + { + "epoch": 0.667583992661392, + "grad_norm": 0.26520992093483164, + "learning_rate": 0.0002628647944501126, + "loss": 2.434, + "step": 5822 + }, + { + "epoch": 0.6676986584107327, + "grad_norm": 0.2483553760706253, + "learning_rate": 0.0002627013285464618, + "loss": 2.489, + "step": 5823 + }, + { + "epoch": 0.6678133241600734, + "grad_norm": 0.255928078299804, + "learning_rate": 0.0002625378953736396, + "loss": 2.5033, + "step": 5824 + }, + { + "epoch": 0.667927989909414, + "grad_norm": 0.24211697108188346, + "learning_rate": 0.0002623744949541886, + "loss": 2.3963, + "step": 5825 + }, + { + "epoch": 0.6680426556587548, + "grad_norm": 0.2647890559606929, + "learning_rate": 0.0002622111273106462, + "loss": 2.3193, + "step": 5826 + }, + { + "epoch": 0.6681573214080954, + "grad_norm": 0.277788007715186, + "learning_rate": 0.0002620477924655461, + "loss": 2.4702, + "step": 5827 + }, + { + "epoch": 0.6682719871574361, + "grad_norm": 0.2460607124802482, + "learning_rate": 0.0002618844904414173, + "loss": 2.369, + "step": 5828 + }, + { + "epoch": 0.6683866529067768, + "grad_norm": 0.24410150939426625, + "learning_rate": 0.0002617212212607841, + "loss": 2.4882, + "step": 5829 + }, + { + "epoch": 0.6685013186561174, + "grad_norm": 0.26404685752331813, + "learning_rate": 0.00026155798494616645, + "loss": 2.2492, + "step": 5830 + }, + { + "epoch": 0.6686159844054581, + "grad_norm": 0.2450358696295318, + "learning_rate": 0.0002613947815200798, + "loss": 2.3861, + "step": 5831 + }, + { + "epoch": 0.6687306501547987, + "grad_norm": 0.256508753509733, + "learning_rate": 0.00026123161100503466, + "loss": 2.5351, + "step": 5832 + }, + { + "epoch": 0.6688453159041394, + "grad_norm": 0.26715591941633976, + "learning_rate": 0.00026106847342353745, + "loss": 2.478, + "step": 5833 + }, + { + "epoch": 0.6689599816534801, + "grad_norm": 0.2574387525470375, + "learning_rate": 0.00026090536879808986, + "loss": 2.3845, + "step": 5834 + }, + { + "epoch": 0.6690746474028207, + "grad_norm": 0.2903925699704349, + "learning_rate": 0.0002607422971511891, + "loss": 2.4275, + "step": 5835 + }, + { + "epoch": 0.6691893131521615, + "grad_norm": 0.2804494104622146, + "learning_rate": 0.00026057925850532775, + "loss": 2.5161, + "step": 5836 + }, + { + "epoch": 0.6693039789015022, + "grad_norm": 0.22938752247390093, + "learning_rate": 0.00026041625288299414, + "loss": 2.4119, + "step": 5837 + }, + { + "epoch": 0.6694186446508428, + "grad_norm": 0.2813340993743448, + "learning_rate": 0.00026025328030667116, + "loss": 2.5075, + "step": 5838 + }, + { + "epoch": 0.6695333104001835, + "grad_norm": 0.2451700677017847, + "learning_rate": 0.00026009034079883857, + "loss": 2.3208, + "step": 5839 + }, + { + "epoch": 0.6696479761495241, + "grad_norm": 0.2444477972459729, + "learning_rate": 0.0002599274343819702, + "loss": 2.5459, + "step": 5840 + }, + { + "epoch": 0.6697626418988648, + "grad_norm": 0.2717160189980225, + "learning_rate": 0.000259764561078536, + "loss": 2.2996, + "step": 5841 + }, + { + "epoch": 0.6698773076482055, + "grad_norm": 0.2264375927879016, + "learning_rate": 0.0002596017209110013, + "loss": 2.356, + "step": 5842 + }, + { + "epoch": 0.6699919733975461, + "grad_norm": 0.22376510689130266, + "learning_rate": 0.0002594389139018269, + "loss": 2.4212, + "step": 5843 + }, + { + "epoch": 0.6701066391468868, + "grad_norm": 0.25114334735399785, + "learning_rate": 0.00025927614007346874, + "loss": 2.5236, + "step": 5844 + }, + { + "epoch": 0.6702213048962274, + "grad_norm": 0.263593600167426, + "learning_rate": 0.0002591133994483786, + "loss": 2.3822, + "step": 5845 + }, + { + "epoch": 0.6703359706455682, + "grad_norm": 0.21509395210204282, + "learning_rate": 0.0002589506920490031, + "loss": 2.3675, + "step": 5846 + }, + { + "epoch": 0.6704506363949089, + "grad_norm": 0.26201572383920296, + "learning_rate": 0.0002587880178977849, + "loss": 2.2854, + "step": 5847 + }, + { + "epoch": 0.6705653021442495, + "grad_norm": 0.25025803403755875, + "learning_rate": 0.00025862537701716164, + "loss": 2.5073, + "step": 5848 + }, + { + "epoch": 0.6706799678935902, + "grad_norm": 0.23987446527945874, + "learning_rate": 0.0002584627694295666, + "loss": 2.287, + "step": 5849 + }, + { + "epoch": 0.6707946336429309, + "grad_norm": 0.25166852219440267, + "learning_rate": 0.0002583001951574284, + "loss": 2.4286, + "step": 5850 + }, + { + "epoch": 0.6709092993922715, + "grad_norm": 0.2532772960086042, + "learning_rate": 0.0002581376542231713, + "loss": 2.4012, + "step": 5851 + }, + { + "epoch": 0.6710239651416122, + "grad_norm": 0.24124231838592292, + "learning_rate": 0.0002579751466492143, + "loss": 2.4907, + "step": 5852 + }, + { + "epoch": 0.6711386308909528, + "grad_norm": 0.28441037507188494, + "learning_rate": 0.0002578126724579721, + "loss": 2.4885, + "step": 5853 + }, + { + "epoch": 0.6712532966402935, + "grad_norm": 0.2549522000471048, + "learning_rate": 0.00025765023167185575, + "loss": 2.4948, + "step": 5854 + }, + { + "epoch": 0.6713679623896343, + "grad_norm": 0.25523779783028466, + "learning_rate": 0.00025748782431327013, + "loss": 2.2664, + "step": 5855 + }, + { + "epoch": 0.6714826281389749, + "grad_norm": 0.21679774321743178, + "learning_rate": 0.0002573254504046163, + "loss": 2.3899, + "step": 5856 + }, + { + "epoch": 0.6715972938883156, + "grad_norm": 0.2603769775819472, + "learning_rate": 0.00025716310996829096, + "loss": 2.4745, + "step": 5857 + }, + { + "epoch": 0.6717119596376563, + "grad_norm": 0.23727766944641868, + "learning_rate": 0.0002570008030266856, + "loss": 2.4293, + "step": 5858 + }, + { + "epoch": 0.6718266253869969, + "grad_norm": 0.25059253054758934, + "learning_rate": 0.00025683852960218747, + "loss": 2.32, + "step": 5859 + }, + { + "epoch": 0.6719412911363376, + "grad_norm": 0.23024222526876786, + "learning_rate": 0.0002566762897171794, + "loss": 2.3824, + "step": 5860 + }, + { + "epoch": 0.6720559568856782, + "grad_norm": 0.2700322663754082, + "learning_rate": 0.0002565140833940387, + "loss": 2.4785, + "step": 5861 + }, + { + "epoch": 0.6721706226350189, + "grad_norm": 0.2566635676578736, + "learning_rate": 0.00025635191065513896, + "loss": 2.4051, + "step": 5862 + }, + { + "epoch": 0.6722852883843596, + "grad_norm": 0.2662057413338458, + "learning_rate": 0.00025618977152284874, + "loss": 2.3916, + "step": 5863 + }, + { + "epoch": 0.6723999541337002, + "grad_norm": 0.2439899583835536, + "learning_rate": 0.0002560276660195322, + "loss": 2.406, + "step": 5864 + }, + { + "epoch": 0.672514619883041, + "grad_norm": 0.2408159922043852, + "learning_rate": 0.0002558655941675485, + "loss": 2.304, + "step": 5865 + }, + { + "epoch": 0.6726292856323816, + "grad_norm": 0.24346162156251644, + "learning_rate": 0.00025570355598925266, + "loss": 2.4549, + "step": 5866 + }, + { + "epoch": 0.6727439513817223, + "grad_norm": 0.27339626600201794, + "learning_rate": 0.00025554155150699445, + "loss": 2.3451, + "step": 5867 + }, + { + "epoch": 0.672858617131063, + "grad_norm": 0.2621980925090863, + "learning_rate": 0.0002553795807431192, + "loss": 2.4296, + "step": 5868 + }, + { + "epoch": 0.6729732828804036, + "grad_norm": 0.28952244705526664, + "learning_rate": 0.0002552176437199682, + "loss": 2.5053, + "step": 5869 + }, + { + "epoch": 0.6730879486297443, + "grad_norm": 0.25947310150940167, + "learning_rate": 0.00025505574045987713, + "loss": 2.4611, + "step": 5870 + }, + { + "epoch": 0.673202614379085, + "grad_norm": 0.23732402243097353, + "learning_rate": 0.0002548938709851776, + "loss": 2.4868, + "step": 5871 + }, + { + "epoch": 0.6733172801284256, + "grad_norm": 0.2462447821195796, + "learning_rate": 0.00025473203531819653, + "loss": 2.4188, + "step": 5872 + }, + { + "epoch": 0.6734319458777663, + "grad_norm": 0.2569346102223702, + "learning_rate": 0.0002545702334812557, + "loss": 2.4894, + "step": 5873 + }, + { + "epoch": 0.6735466116271069, + "grad_norm": 0.2649165498663336, + "learning_rate": 0.00025440846549667297, + "loss": 2.3394, + "step": 5874 + }, + { + "epoch": 0.6736612773764477, + "grad_norm": 0.26382775321129204, + "learning_rate": 0.00025424673138676123, + "loss": 2.4476, + "step": 5875 + }, + { + "epoch": 0.6737759431257884, + "grad_norm": 0.23108671893421445, + "learning_rate": 0.0002540850311738282, + "loss": 2.4266, + "step": 5876 + }, + { + "epoch": 0.673890608875129, + "grad_norm": 0.2335579654361149, + "learning_rate": 0.00025392336488017764, + "loss": 2.4573, + "step": 5877 + }, + { + "epoch": 0.6740052746244697, + "grad_norm": 0.23736260011359034, + "learning_rate": 0.00025376173252810813, + "loss": 2.4134, + "step": 5878 + }, + { + "epoch": 0.6741199403738103, + "grad_norm": 0.21910237059644166, + "learning_rate": 0.00025360013413991395, + "loss": 2.38, + "step": 5879 + }, + { + "epoch": 0.674234606123151, + "grad_norm": 0.23505053392981368, + "learning_rate": 0.0002534385697378845, + "loss": 2.377, + "step": 5880 + }, + { + "epoch": 0.6743492718724917, + "grad_norm": 0.24578701409554926, + "learning_rate": 0.00025327703934430456, + "loss": 2.4878, + "step": 5881 + }, + { + "epoch": 0.6744639376218323, + "grad_norm": 0.22919335517878578, + "learning_rate": 0.0002531155429814539, + "loss": 2.5216, + "step": 5882 + }, + { + "epoch": 0.674578603371173, + "grad_norm": 0.27020361971866275, + "learning_rate": 0.00025295408067160807, + "loss": 2.3832, + "step": 5883 + }, + { + "epoch": 0.6746932691205138, + "grad_norm": 0.2166826344627892, + "learning_rate": 0.0002527926524370378, + "loss": 2.4304, + "step": 5884 + }, + { + "epoch": 0.6748079348698544, + "grad_norm": 0.23714389448091572, + "learning_rate": 0.00025263125830000873, + "loss": 2.3412, + "step": 5885 + }, + { + "epoch": 0.6749226006191951, + "grad_norm": 0.28015954831863465, + "learning_rate": 0.00025246989828278237, + "loss": 2.5086, + "step": 5886 + }, + { + "epoch": 0.6750372663685357, + "grad_norm": 0.22883550320154789, + "learning_rate": 0.0002523085724076154, + "loss": 2.4369, + "step": 5887 + }, + { + "epoch": 0.6751519321178764, + "grad_norm": 0.2537664035537909, + "learning_rate": 0.00025214728069675906, + "loss": 2.4068, + "step": 5888 + }, + { + "epoch": 0.6752665978672171, + "grad_norm": 0.23571029398729612, + "learning_rate": 0.00025198602317246123, + "loss": 2.4127, + "step": 5889 + }, + { + "epoch": 0.6753812636165577, + "grad_norm": 0.2811325464767358, + "learning_rate": 0.0002518247998569637, + "loss": 2.561, + "step": 5890 + }, + { + "epoch": 0.6754959293658984, + "grad_norm": 0.22473587153980507, + "learning_rate": 0.0002516636107725044, + "loss": 2.5216, + "step": 5891 + }, + { + "epoch": 0.6756105951152391, + "grad_norm": 0.24642430135370247, + "learning_rate": 0.00025150245594131624, + "loss": 2.4197, + "step": 5892 + }, + { + "epoch": 0.6757252608645797, + "grad_norm": 0.250258597634242, + "learning_rate": 0.00025134133538562756, + "loss": 2.4669, + "step": 5893 + }, + { + "epoch": 0.6758399266139204, + "grad_norm": 0.2635142679032383, + "learning_rate": 0.00025118024912766174, + "loss": 2.354, + "step": 5894 + }, + { + "epoch": 0.675954592363261, + "grad_norm": 0.25277435164073214, + "learning_rate": 0.0002510191971896376, + "loss": 2.3408, + "step": 5895 + }, + { + "epoch": 0.6760692581126018, + "grad_norm": 0.2578933627793284, + "learning_rate": 0.00025085817959376945, + "loss": 2.4225, + "step": 5896 + }, + { + "epoch": 0.6761839238619425, + "grad_norm": 0.2675630706458878, + "learning_rate": 0.00025069719636226616, + "loss": 2.3976, + "step": 5897 + }, + { + "epoch": 0.6762985896112831, + "grad_norm": 0.25339599527676654, + "learning_rate": 0.0002505362475173325, + "loss": 2.4903, + "step": 5898 + }, + { + "epoch": 0.6764132553606238, + "grad_norm": 0.2387584991554649, + "learning_rate": 0.0002503753330811682, + "loss": 2.4778, + "step": 5899 + }, + { + "epoch": 0.6765279211099644, + "grad_norm": 0.24591141481484705, + "learning_rate": 0.00025021445307596847, + "loss": 2.3851, + "step": 5900 + }, + { + "epoch": 0.6766425868593051, + "grad_norm": 0.23155630142824957, + "learning_rate": 0.0002500536075239236, + "loss": 2.3125, + "step": 5901 + }, + { + "epoch": 0.6767572526086458, + "grad_norm": 0.232707155713306, + "learning_rate": 0.00024989279644721936, + "loss": 2.4229, + "step": 5902 + }, + { + "epoch": 0.6768719183579864, + "grad_norm": 0.26805417164901, + "learning_rate": 0.000249732019868036, + "loss": 2.3585, + "step": 5903 + }, + { + "epoch": 0.6769865841073271, + "grad_norm": 0.24279181077830103, + "learning_rate": 0.00024957127780855033, + "loss": 2.347, + "step": 5904 + }, + { + "epoch": 0.6771012498566679, + "grad_norm": 0.272616109737779, + "learning_rate": 0.00024941057029093306, + "loss": 2.5327, + "step": 5905 + }, + { + "epoch": 0.6772159156060085, + "grad_norm": 0.2551979228619229, + "learning_rate": 0.0002492498973373509, + "loss": 2.4281, + "step": 5906 + }, + { + "epoch": 0.6773305813553492, + "grad_norm": 0.24572448791223372, + "learning_rate": 0.00024908925896996583, + "loss": 2.4757, + "step": 5907 + }, + { + "epoch": 0.6774452471046898, + "grad_norm": 0.26693233352896395, + "learning_rate": 0.00024892865521093454, + "loss": 2.3794, + "step": 5908 + }, + { + "epoch": 0.6775599128540305, + "grad_norm": 0.2773657909438781, + "learning_rate": 0.0002487680860824095, + "loss": 2.3026, + "step": 5909 + }, + { + "epoch": 0.6776745786033712, + "grad_norm": 0.2176424816177569, + "learning_rate": 0.0002486075516065382, + "loss": 2.4033, + "step": 5910 + }, + { + "epoch": 0.6777892443527118, + "grad_norm": 0.24747854142718895, + "learning_rate": 0.000248447051805463, + "loss": 2.3994, + "step": 5911 + }, + { + "epoch": 0.6779039101020525, + "grad_norm": 0.27009090810666164, + "learning_rate": 0.000248286586701322, + "loss": 2.4939, + "step": 5912 + }, + { + "epoch": 0.6780185758513931, + "grad_norm": 0.26388707263814826, + "learning_rate": 0.00024812615631624824, + "loss": 2.4566, + "step": 5913 + }, + { + "epoch": 0.6781332416007338, + "grad_norm": 0.24910976625115885, + "learning_rate": 0.0002479657606723701, + "loss": 2.4372, + "step": 5914 + }, + { + "epoch": 0.6782479073500746, + "grad_norm": 0.2559728568280635, + "learning_rate": 0.00024780539979181107, + "loss": 2.4859, + "step": 5915 + }, + { + "epoch": 0.6783625730994152, + "grad_norm": 0.2648773513855566, + "learning_rate": 0.00024764507369669, + "loss": 2.3807, + "step": 5916 + }, + { + "epoch": 0.6784772388487559, + "grad_norm": 0.24838837270309236, + "learning_rate": 0.00024748478240912053, + "loss": 2.2707, + "step": 5917 + }, + { + "epoch": 0.6785919045980966, + "grad_norm": 0.27010994079675804, + "learning_rate": 0.0002473245259512118, + "loss": 2.4327, + "step": 5918 + }, + { + "epoch": 0.6787065703474372, + "grad_norm": 0.2436045844203098, + "learning_rate": 0.0002471643043450686, + "loss": 2.2979, + "step": 5919 + }, + { + "epoch": 0.6788212360967779, + "grad_norm": 0.2617427159749266, + "learning_rate": 0.00024700411761278995, + "loss": 2.3195, + "step": 5920 + }, + { + "epoch": 0.6789359018461185, + "grad_norm": 0.24559457889463165, + "learning_rate": 0.0002468439657764708, + "loss": 2.3256, + "step": 5921 + }, + { + "epoch": 0.6790505675954592, + "grad_norm": 0.2719261131528808, + "learning_rate": 0.0002466838488582011, + "loss": 2.4911, + "step": 5922 + }, + { + "epoch": 0.6791652333448, + "grad_norm": 0.27087362183702196, + "learning_rate": 0.00024652376688006543, + "loss": 2.4066, + "step": 5923 + }, + { + "epoch": 0.6792798990941405, + "grad_norm": 0.27600723664610594, + "learning_rate": 0.0002463637198641446, + "loss": 2.3256, + "step": 5924 + }, + { + "epoch": 0.6793945648434813, + "grad_norm": 0.2652221131227717, + "learning_rate": 0.00024620370783251413, + "loss": 2.3651, + "step": 5925 + }, + { + "epoch": 0.679509230592822, + "grad_norm": 0.25463130838323844, + "learning_rate": 0.00024604373080724414, + "loss": 2.3179, + "step": 5926 + }, + { + "epoch": 0.6796238963421626, + "grad_norm": 0.2382997507235637, + "learning_rate": 0.00024588378881040064, + "loss": 2.3115, + "step": 5927 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.264290060030737, + "learning_rate": 0.00024572388186404456, + "loss": 2.3649, + "step": 5928 + }, + { + "epoch": 0.6798532278408439, + "grad_norm": 0.26962157118235613, + "learning_rate": 0.000245564009990232, + "loss": 2.5403, + "step": 5929 + }, + { + "epoch": 0.6799678935901846, + "grad_norm": 0.25068342091440526, + "learning_rate": 0.00024540417321101434, + "loss": 2.3642, + "step": 5930 + }, + { + "epoch": 0.6800825593395253, + "grad_norm": 0.2786768365078446, + "learning_rate": 0.0002452443715484381, + "loss": 2.4606, + "step": 5931 + }, + { + "epoch": 0.6801972250888659, + "grad_norm": 0.27548258901410927, + "learning_rate": 0.00024508460502454453, + "loss": 2.3439, + "step": 5932 + }, + { + "epoch": 0.6803118908382066, + "grad_norm": 0.23455765887383576, + "learning_rate": 0.00024492487366137056, + "loss": 2.4906, + "step": 5933 + }, + { + "epoch": 0.6804265565875472, + "grad_norm": 0.25440478626484697, + "learning_rate": 0.0002447651774809481, + "loss": 2.3705, + "step": 5934 + }, + { + "epoch": 0.680541222336888, + "grad_norm": 0.25749006959661125, + "learning_rate": 0.0002446055165053042, + "loss": 2.3652, + "step": 5935 + }, + { + "epoch": 0.6806558880862287, + "grad_norm": 0.2521559261381318, + "learning_rate": 0.0002444458907564611, + "loss": 2.3948, + "step": 5936 + }, + { + "epoch": 0.6807705538355693, + "grad_norm": 0.27495783942904434, + "learning_rate": 0.00024428630025643617, + "loss": 2.2922, + "step": 5937 + }, + { + "epoch": 0.68088521958491, + "grad_norm": 0.2837050212860146, + "learning_rate": 0.00024412674502724142, + "loss": 2.3345, + "step": 5938 + }, + { + "epoch": 0.6809998853342507, + "grad_norm": 0.2395640420658184, + "learning_rate": 0.00024396722509088497, + "loss": 2.4224, + "step": 5939 + }, + { + "epoch": 0.6811145510835913, + "grad_norm": 0.2268010809022975, + "learning_rate": 0.0002438077404693696, + "loss": 2.3514, + "step": 5940 + }, + { + "epoch": 0.681229216832932, + "grad_norm": 0.2679908185901851, + "learning_rate": 0.0002436482911846928, + "loss": 2.4255, + "step": 5941 + }, + { + "epoch": 0.6813438825822726, + "grad_norm": 0.26518135787583974, + "learning_rate": 0.00024348887725884766, + "loss": 2.4844, + "step": 5942 + }, + { + "epoch": 0.6814585483316133, + "grad_norm": 0.26472901173473073, + "learning_rate": 0.00024332949871382238, + "loss": 2.547, + "step": 5943 + }, + { + "epoch": 0.6815732140809541, + "grad_norm": 0.23726327026324298, + "learning_rate": 0.00024317015557160022, + "loss": 2.5146, + "step": 5944 + }, + { + "epoch": 0.6816878798302947, + "grad_norm": 0.2764433058310707, + "learning_rate": 0.00024301084785415938, + "loss": 2.3204, + "step": 5945 + }, + { + "epoch": 0.6818025455796354, + "grad_norm": 0.25050205161613665, + "learning_rate": 0.00024285157558347372, + "loss": 2.5113, + "step": 5946 + }, + { + "epoch": 0.681917211328976, + "grad_norm": 0.27296474692877987, + "learning_rate": 0.00024269233878151125, + "loss": 2.4571, + "step": 5947 + }, + { + "epoch": 0.6820318770783167, + "grad_norm": 0.25927578895452735, + "learning_rate": 0.00024253313747023592, + "loss": 2.3687, + "step": 5948 + }, + { + "epoch": 0.6821465428276574, + "grad_norm": 0.22678160139598205, + "learning_rate": 0.00024237397167160653, + "loss": 2.5486, + "step": 5949 + }, + { + "epoch": 0.682261208576998, + "grad_norm": 0.2659364625002715, + "learning_rate": 0.00024221484140757692, + "loss": 2.375, + "step": 5950 + }, + { + "epoch": 0.6823758743263387, + "grad_norm": 0.2529910655021268, + "learning_rate": 0.00024205574670009618, + "loss": 2.4485, + "step": 5951 + }, + { + "epoch": 0.6824905400756794, + "grad_norm": 0.2426440602915467, + "learning_rate": 0.00024189668757110844, + "loss": 2.4361, + "step": 5952 + }, + { + "epoch": 0.68260520582502, + "grad_norm": 0.25004059774554244, + "learning_rate": 0.00024173766404255237, + "loss": 2.6313, + "step": 5953 + }, + { + "epoch": 0.6827198715743608, + "grad_norm": 0.2357702364768859, + "learning_rate": 0.00024157867613636304, + "loss": 2.3629, + "step": 5954 + }, + { + "epoch": 0.6828345373237014, + "grad_norm": 0.2531489651126013, + "learning_rate": 0.00024141972387446914, + "loss": 2.5759, + "step": 5955 + }, + { + "epoch": 0.6829492030730421, + "grad_norm": 0.2484231619181877, + "learning_rate": 0.0002412608072787954, + "loss": 2.3616, + "step": 5956 + }, + { + "epoch": 0.6830638688223828, + "grad_norm": 0.2948318936902473, + "learning_rate": 0.00024110192637126122, + "loss": 2.5161, + "step": 5957 + }, + { + "epoch": 0.6831785345717234, + "grad_norm": 0.28413265794348663, + "learning_rate": 0.00024094308117378127, + "loss": 2.3751, + "step": 5958 + }, + { + "epoch": 0.6832932003210641, + "grad_norm": 0.24185703249314555, + "learning_rate": 0.00024078427170826523, + "loss": 2.2914, + "step": 5959 + }, + { + "epoch": 0.6834078660704048, + "grad_norm": 0.270538450484696, + "learning_rate": 0.000240625497996618, + "loss": 2.4241, + "step": 5960 + }, + { + "epoch": 0.6835225318197454, + "grad_norm": 0.2584247070936721, + "learning_rate": 0.00024046676006073898, + "loss": 2.3998, + "step": 5961 + }, + { + "epoch": 0.6836371975690861, + "grad_norm": 0.24547427202955674, + "learning_rate": 0.00024030805792252325, + "loss": 2.4195, + "step": 5962 + }, + { + "epoch": 0.6837518633184267, + "grad_norm": 0.24529444483689977, + "learning_rate": 0.00024014939160386074, + "loss": 2.4509, + "step": 5963 + }, + { + "epoch": 0.6838665290677675, + "grad_norm": 0.2520558691476565, + "learning_rate": 0.00023999076112663647, + "loss": 2.3378, + "step": 5964 + }, + { + "epoch": 0.6839811948171082, + "grad_norm": 0.2535389506748618, + "learning_rate": 0.00023983216651273043, + "loss": 2.4644, + "step": 5965 + }, + { + "epoch": 0.6840958605664488, + "grad_norm": 0.2451114482287227, + "learning_rate": 0.00023967360778401803, + "loss": 2.4831, + "step": 5966 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.25715563112976164, + "learning_rate": 0.00023951508496236897, + "loss": 2.2813, + "step": 5967 + }, + { + "epoch": 0.6843251920651301, + "grad_norm": 0.2439105770800984, + "learning_rate": 0.00023935659806964844, + "loss": 2.3361, + "step": 5968 + }, + { + "epoch": 0.6844398578144708, + "grad_norm": 0.24473107374652628, + "learning_rate": 0.00023919814712771725, + "loss": 2.6024, + "step": 5969 + }, + { + "epoch": 0.6845545235638115, + "grad_norm": 0.22765717686941522, + "learning_rate": 0.0002390397321584301, + "loss": 2.3261, + "step": 5970 + }, + { + "epoch": 0.6846691893131521, + "grad_norm": 0.2649468817745486, + "learning_rate": 0.00023888135318363764, + "loss": 2.4759, + "step": 5971 + }, + { + "epoch": 0.6847838550624928, + "grad_norm": 0.2424686437415284, + "learning_rate": 0.00023872301022518527, + "loss": 2.4309, + "step": 5972 + }, + { + "epoch": 0.6848985208118336, + "grad_norm": 0.23988201752151206, + "learning_rate": 0.00023856470330491287, + "loss": 2.3969, + "step": 5973 + }, + { + "epoch": 0.6850131865611742, + "grad_norm": 0.2733974010482567, + "learning_rate": 0.00023840643244465643, + "loss": 2.2964, + "step": 5974 + }, + { + "epoch": 0.6851278523105149, + "grad_norm": 0.263365702608744, + "learning_rate": 0.0002382481976662464, + "loss": 2.4853, + "step": 5975 + }, + { + "epoch": 0.6852425180598555, + "grad_norm": 0.24022707045594938, + "learning_rate": 0.0002380899989915079, + "loss": 2.3139, + "step": 5976 + }, + { + "epoch": 0.6853571838091962, + "grad_norm": 0.25887098517347007, + "learning_rate": 0.00023793183644226152, + "loss": 2.4615, + "step": 5977 + }, + { + "epoch": 0.6854718495585369, + "grad_norm": 0.26789157388600776, + "learning_rate": 0.00023777371004032289, + "loss": 2.4133, + "step": 5978 + }, + { + "epoch": 0.6855865153078775, + "grad_norm": 0.23428414183566829, + "learning_rate": 0.0002376156198075024, + "loss": 2.3758, + "step": 5979 + }, + { + "epoch": 0.6857011810572182, + "grad_norm": 0.24782346397380386, + "learning_rate": 0.00023745756576560572, + "loss": 2.4501, + "step": 5980 + }, + { + "epoch": 0.6858158468065588, + "grad_norm": 0.25427364735193364, + "learning_rate": 0.00023729954793643348, + "loss": 2.4936, + "step": 5981 + }, + { + "epoch": 0.6859305125558995, + "grad_norm": 0.2592222653160551, + "learning_rate": 0.0002371415663417809, + "loss": 2.4839, + "step": 5982 + }, + { + "epoch": 0.6860451783052403, + "grad_norm": 0.25862299505307546, + "learning_rate": 0.00023698362100343858, + "loss": 2.4485, + "step": 5983 + }, + { + "epoch": 0.6861598440545809, + "grad_norm": 0.25706240565145433, + "learning_rate": 0.00023682571194319247, + "loss": 2.354, + "step": 5984 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 0.25473879671946764, + "learning_rate": 0.0002366678391828227, + "loss": 2.3497, + "step": 5985 + }, + { + "epoch": 0.6863891755532623, + "grad_norm": 0.259146785436911, + "learning_rate": 0.00023651000274410493, + "loss": 2.4152, + "step": 5986 + }, + { + "epoch": 0.6865038413026029, + "grad_norm": 0.26580596935281386, + "learning_rate": 0.0002363522026488099, + "loss": 2.397, + "step": 5987 + }, + { + "epoch": 0.6866185070519436, + "grad_norm": 0.2637236203569202, + "learning_rate": 0.00023619443891870258, + "loss": 2.2371, + "step": 5988 + }, + { + "epoch": 0.6867331728012842, + "grad_norm": 0.3099883403327138, + "learning_rate": 0.00023603671157554397, + "loss": 2.4009, + "step": 5989 + }, + { + "epoch": 0.6868478385506249, + "grad_norm": 0.2717404622692867, + "learning_rate": 0.00023587902064108962, + "loss": 2.4706, + "step": 5990 + }, + { + "epoch": 0.6869625042999656, + "grad_norm": 0.27227966807734244, + "learning_rate": 0.00023572136613708955, + "loss": 2.3522, + "step": 5991 + }, + { + "epoch": 0.6870771700493062, + "grad_norm": 0.25207348136714425, + "learning_rate": 0.0002355637480852894, + "loss": 2.3323, + "step": 5992 + }, + { + "epoch": 0.687191835798647, + "grad_norm": 0.2323301108521448, + "learning_rate": 0.00023540616650742963, + "loss": 2.2851, + "step": 5993 + }, + { + "epoch": 0.6873065015479877, + "grad_norm": 0.2556784604113841, + "learning_rate": 0.00023524862142524557, + "loss": 2.338, + "step": 5994 + }, + { + "epoch": 0.6874211672973283, + "grad_norm": 0.256515269018604, + "learning_rate": 0.00023509111286046752, + "loss": 2.5354, + "step": 5995 + }, + { + "epoch": 0.687535833046669, + "grad_norm": 0.22087769681952038, + "learning_rate": 0.00023493364083482105, + "loss": 2.3333, + "step": 5996 + }, + { + "epoch": 0.6876504987960096, + "grad_norm": 0.2551928427023443, + "learning_rate": 0.00023477620537002604, + "loss": 2.3491, + "step": 5997 + }, + { + "epoch": 0.6877651645453503, + "grad_norm": 0.2426545704312682, + "learning_rate": 0.00023461880648779782, + "loss": 2.4617, + "step": 5998 + }, + { + "epoch": 0.687879830294691, + "grad_norm": 0.28228693609092514, + "learning_rate": 0.00023446144420984666, + "loss": 2.4737, + "step": 5999 + }, + { + "epoch": 0.6879944960440316, + "grad_norm": 0.2660732786067178, + "learning_rate": 0.00023430411855787764, + "loss": 2.5198, + "step": 6000 + }, + { + "epoch": 0.6881091617933723, + "grad_norm": 0.2682803624536527, + "learning_rate": 0.00023414682955359085, + "loss": 2.285, + "step": 6001 + }, + { + "epoch": 0.6882238275427129, + "grad_norm": 0.25371106817736977, + "learning_rate": 0.00023398957721868147, + "loss": 2.4452, + "step": 6002 + }, + { + "epoch": 0.6883384932920537, + "grad_norm": 0.2684407354524099, + "learning_rate": 0.0002338323615748389, + "loss": 2.3738, + "step": 6003 + }, + { + "epoch": 0.6884531590413944, + "grad_norm": 0.2718203646472465, + "learning_rate": 0.0002336751826437488, + "loss": 2.4163, + "step": 6004 + }, + { + "epoch": 0.688567824790735, + "grad_norm": 0.25776345692107555, + "learning_rate": 0.00023351804044709052, + "loss": 2.3195, + "step": 6005 + }, + { + "epoch": 0.6886824905400757, + "grad_norm": 0.28017188060966125, + "learning_rate": 0.00023336093500653888, + "loss": 2.5245, + "step": 6006 + }, + { + "epoch": 0.6887971562894164, + "grad_norm": 0.2828575163434155, + "learning_rate": 0.00023320386634376368, + "loss": 2.3966, + "step": 6007 + }, + { + "epoch": 0.688911822038757, + "grad_norm": 0.27808779909933234, + "learning_rate": 0.00023304683448042957, + "loss": 2.4189, + "step": 6008 + }, + { + "epoch": 0.6890264877880977, + "grad_norm": 0.27887565906848655, + "learning_rate": 0.00023288983943819597, + "loss": 2.4518, + "step": 6009 + }, + { + "epoch": 0.6891411535374383, + "grad_norm": 0.2932893899820623, + "learning_rate": 0.00023273288123871767, + "loss": 2.4071, + "step": 6010 + }, + { + "epoch": 0.689255819286779, + "grad_norm": 0.23137782650760233, + "learning_rate": 0.00023257595990364366, + "loss": 2.3965, + "step": 6011 + }, + { + "epoch": 0.6893704850361198, + "grad_norm": 0.26637552574109913, + "learning_rate": 0.00023241907545461837, + "loss": 2.4212, + "step": 6012 + }, + { + "epoch": 0.6894851507854604, + "grad_norm": 0.2504518103977167, + "learning_rate": 0.0002322622279132811, + "loss": 2.3614, + "step": 6013 + }, + { + "epoch": 0.6895998165348011, + "grad_norm": 0.24774369044488778, + "learning_rate": 0.0002321054173012659, + "loss": 2.4518, + "step": 6014 + }, + { + "epoch": 0.6897144822841417, + "grad_norm": 0.25556935747182274, + "learning_rate": 0.0002319486436402019, + "loss": 2.3862, + "step": 6015 + }, + { + "epoch": 0.6898291480334824, + "grad_norm": 0.2304107534739128, + "learning_rate": 0.00023179190695171316, + "loss": 2.3165, + "step": 6016 + }, + { + "epoch": 0.6899438137828231, + "grad_norm": 0.25369502599549154, + "learning_rate": 0.0002316352072574181, + "loss": 2.544, + "step": 6017 + }, + { + "epoch": 0.6900584795321637, + "grad_norm": 0.25800459517868296, + "learning_rate": 0.00023147854457893058, + "loss": 2.5028, + "step": 6018 + }, + { + "epoch": 0.6901731452815044, + "grad_norm": 0.2599825953979226, + "learning_rate": 0.0002313219189378597, + "loss": 2.5127, + "step": 6019 + }, + { + "epoch": 0.6902878110308451, + "grad_norm": 0.27956996653213906, + "learning_rate": 0.00023116533035580844, + "loss": 2.3869, + "step": 6020 + }, + { + "epoch": 0.6904024767801857, + "grad_norm": 0.25144555438504407, + "learning_rate": 0.00023100877885437544, + "loss": 2.4285, + "step": 6021 + }, + { + "epoch": 0.6905171425295265, + "grad_norm": 0.26793971796127836, + "learning_rate": 0.00023085226445515412, + "loss": 2.5434, + "step": 6022 + }, + { + "epoch": 0.690631808278867, + "grad_norm": 0.2524798530443962, + "learning_rate": 0.0002306957871797321, + "loss": 2.3822, + "step": 6023 + }, + { + "epoch": 0.6907464740282078, + "grad_norm": 0.25414619201184835, + "learning_rate": 0.00023053934704969303, + "loss": 2.3981, + "step": 6024 + }, + { + "epoch": 0.6908611397775485, + "grad_norm": 0.2680085705854303, + "learning_rate": 0.00023038294408661488, + "loss": 2.3863, + "step": 6025 + }, + { + "epoch": 0.6909758055268891, + "grad_norm": 0.2523206307534517, + "learning_rate": 0.00023022657831207, + "loss": 2.4861, + "step": 6026 + }, + { + "epoch": 0.6910904712762298, + "grad_norm": 0.2211428267397594, + "learning_rate": 0.00023007024974762625, + "loss": 2.3405, + "step": 6027 + }, + { + "epoch": 0.6912051370255705, + "grad_norm": 0.21711186694922344, + "learning_rate": 0.0002299139584148463, + "loss": 2.3254, + "step": 6028 + }, + { + "epoch": 0.6913198027749111, + "grad_norm": 0.2687171898704878, + "learning_rate": 0.00022975770433528742, + "loss": 2.4673, + "step": 6029 + }, + { + "epoch": 0.6914344685242518, + "grad_norm": 0.2526553915969993, + "learning_rate": 0.00022960148753050198, + "loss": 2.3097, + "step": 6030 + }, + { + "epoch": 0.6915491342735924, + "grad_norm": 0.2400595220874449, + "learning_rate": 0.00022944530802203723, + "loss": 2.3928, + "step": 6031 + }, + { + "epoch": 0.6916638000229332, + "grad_norm": 0.24070170839143495, + "learning_rate": 0.00022928916583143483, + "loss": 2.3958, + "step": 6032 + }, + { + "epoch": 0.6917784657722739, + "grad_norm": 0.261669425278507, + "learning_rate": 0.00022913306098023157, + "loss": 2.3152, + "step": 6033 + }, + { + "epoch": 0.6918931315216145, + "grad_norm": 0.2611638711887135, + "learning_rate": 0.00022897699348995977, + "loss": 2.5859, + "step": 6034 + }, + { + "epoch": 0.6920077972709552, + "grad_norm": 0.2420817711503355, + "learning_rate": 0.00022882096338214532, + "loss": 2.4329, + "step": 6035 + }, + { + "epoch": 0.6921224630202958, + "grad_norm": 0.2535777943157481, + "learning_rate": 0.00022866497067830982, + "loss": 2.5085, + "step": 6036 + }, + { + "epoch": 0.6922371287696365, + "grad_norm": 0.2582391630889939, + "learning_rate": 0.00022850901539996971, + "loss": 2.4327, + "step": 6037 + }, + { + "epoch": 0.6923517945189772, + "grad_norm": 0.242483854221785, + "learning_rate": 0.00022835309756863548, + "loss": 2.4958, + "step": 6038 + }, + { + "epoch": 0.6924664602683178, + "grad_norm": 0.2586813358047851, + "learning_rate": 0.00022819721720581355, + "loss": 2.4243, + "step": 6039 + }, + { + "epoch": 0.6925811260176585, + "grad_norm": 0.24531309423144862, + "learning_rate": 0.00022804137433300465, + "loss": 2.3632, + "step": 6040 + }, + { + "epoch": 0.6926957917669992, + "grad_norm": 0.2499376248500328, + "learning_rate": 0.00022788556897170397, + "loss": 2.5065, + "step": 6041 + }, + { + "epoch": 0.6928104575163399, + "grad_norm": 0.23896445264405852, + "learning_rate": 0.00022772980114340208, + "loss": 2.3797, + "step": 6042 + }, + { + "epoch": 0.6929251232656806, + "grad_norm": 0.23312729924962428, + "learning_rate": 0.0002275740708695842, + "loss": 2.2866, + "step": 6043 + }, + { + "epoch": 0.6930397890150212, + "grad_norm": 0.23391535979741276, + "learning_rate": 0.00022741837817173032, + "loss": 2.451, + "step": 6044 + }, + { + "epoch": 0.6931544547643619, + "grad_norm": 0.2655517868447734, + "learning_rate": 0.0002272627230713153, + "loss": 2.5203, + "step": 6045 + }, + { + "epoch": 0.6932691205137026, + "grad_norm": 0.25581189120813497, + "learning_rate": 0.00022710710558980897, + "loss": 2.4073, + "step": 6046 + }, + { + "epoch": 0.6933837862630432, + "grad_norm": 0.25545915385073414, + "learning_rate": 0.00022695152574867545, + "loss": 2.3968, + "step": 6047 + }, + { + "epoch": 0.6934984520123839, + "grad_norm": 0.2536850588934234, + "learning_rate": 0.00022679598356937415, + "loss": 2.4762, + "step": 6048 + }, + { + "epoch": 0.6936131177617245, + "grad_norm": 0.23372803142015164, + "learning_rate": 0.00022664047907335917, + "loss": 2.3904, + "step": 6049 + }, + { + "epoch": 0.6937277835110652, + "grad_norm": 0.23391094359989, + "learning_rate": 0.00022648501228207942, + "loss": 2.4163, + "step": 6050 + }, + { + "epoch": 0.693842449260406, + "grad_norm": 0.2840064279502021, + "learning_rate": 0.0002263295832169785, + "loss": 2.5135, + "step": 6051 + }, + { + "epoch": 0.6939571150097466, + "grad_norm": 0.23929194980645876, + "learning_rate": 0.00022617419189949518, + "loss": 2.4789, + "step": 6052 + }, + { + "epoch": 0.6940717807590873, + "grad_norm": 0.27090188683192695, + "learning_rate": 0.00022601883835106208, + "loss": 2.2868, + "step": 6053 + }, + { + "epoch": 0.694186446508428, + "grad_norm": 0.2624206903796548, + "learning_rate": 0.0002258635225931081, + "loss": 2.3578, + "step": 6054 + }, + { + "epoch": 0.6943011122577686, + "grad_norm": 0.2123345810483625, + "learning_rate": 0.0002257082446470554, + "loss": 2.3363, + "step": 6055 + }, + { + "epoch": 0.6944157780071093, + "grad_norm": 0.23344845397915376, + "learning_rate": 0.00022555300453432192, + "loss": 2.3605, + "step": 6056 + }, + { + "epoch": 0.6945304437564499, + "grad_norm": 0.23884032436175504, + "learning_rate": 0.00022539780227632007, + "loss": 2.4283, + "step": 6057 + }, + { + "epoch": 0.6946451095057906, + "grad_norm": 0.2542484066507928, + "learning_rate": 0.00022524263789445694, + "loss": 2.3803, + "step": 6058 + }, + { + "epoch": 0.6947597752551313, + "grad_norm": 0.2413493138490213, + "learning_rate": 0.00022508751141013463, + "loss": 2.4603, + "step": 6059 + }, + { + "epoch": 0.6948744410044719, + "grad_norm": 0.24658475187835932, + "learning_rate": 0.00022493242284474992, + "loss": 2.1539, + "step": 6060 + }, + { + "epoch": 0.6949891067538126, + "grad_norm": 0.25866980424589175, + "learning_rate": 0.00022477737221969407, + "loss": 2.2465, + "step": 6061 + }, + { + "epoch": 0.6951037725031534, + "grad_norm": 0.2909943831876313, + "learning_rate": 0.00022462235955635347, + "loss": 2.6218, + "step": 6062 + }, + { + "epoch": 0.695218438252494, + "grad_norm": 0.2507038658433087, + "learning_rate": 0.00022446738487610923, + "loss": 2.4541, + "step": 6063 + }, + { + "epoch": 0.6953331040018347, + "grad_norm": 0.2484650058685007, + "learning_rate": 0.0002243124482003371, + "loss": 2.5083, + "step": 6064 + }, + { + "epoch": 0.6954477697511753, + "grad_norm": 0.26319575810254775, + "learning_rate": 0.00022415754955040762, + "loss": 2.4375, + "step": 6065 + }, + { + "epoch": 0.695562435500516, + "grad_norm": 0.2548088509715983, + "learning_rate": 0.0002240026889476864, + "loss": 2.4561, + "step": 6066 + }, + { + "epoch": 0.6956771012498567, + "grad_norm": 0.2392221857286319, + "learning_rate": 0.00022384786641353299, + "loss": 2.3351, + "step": 6067 + }, + { + "epoch": 0.6957917669991973, + "grad_norm": 0.2526821430350191, + "learning_rate": 0.00022369308196930237, + "loss": 2.376, + "step": 6068 + }, + { + "epoch": 0.695906432748538, + "grad_norm": 0.2723388980766227, + "learning_rate": 0.00022353833563634452, + "loss": 2.4998, + "step": 6069 + }, + { + "epoch": 0.6960210984978786, + "grad_norm": 0.2631895448480991, + "learning_rate": 0.00022338362743600326, + "loss": 2.2407, + "step": 6070 + }, + { + "epoch": 0.6961357642472193, + "grad_norm": 0.2637832750776056, + "learning_rate": 0.0002232289573896178, + "loss": 2.5123, + "step": 6071 + }, + { + "epoch": 0.6962504299965601, + "grad_norm": 0.2720490386181197, + "learning_rate": 0.00022307432551852213, + "loss": 2.3557, + "step": 6072 + }, + { + "epoch": 0.6963650957459007, + "grad_norm": 0.27838838904850055, + "learning_rate": 0.00022291973184404419, + "loss": 2.3983, + "step": 6073 + }, + { + "epoch": 0.6964797614952414, + "grad_norm": 0.2512977583300789, + "learning_rate": 0.00022276517638750783, + "loss": 2.4426, + "step": 6074 + }, + { + "epoch": 0.6965944272445821, + "grad_norm": 0.2820055149496258, + "learning_rate": 0.00022261065917023094, + "loss": 2.3921, + "step": 6075 + }, + { + "epoch": 0.6967090929939227, + "grad_norm": 0.2539027499816261, + "learning_rate": 0.00022245618021352593, + "loss": 2.2748, + "step": 6076 + }, + { + "epoch": 0.6968237587432634, + "grad_norm": 0.2643284120615805, + "learning_rate": 0.00022230173953870043, + "loss": 2.3921, + "step": 6077 + }, + { + "epoch": 0.696938424492604, + "grad_norm": 0.2818843205409747, + "learning_rate": 0.0002221473371670565, + "loss": 2.6005, + "step": 6078 + }, + { + "epoch": 0.6970530902419447, + "grad_norm": 0.24661314052114677, + "learning_rate": 0.00022199297311989108, + "loss": 2.4509, + "step": 6079 + }, + { + "epoch": 0.6971677559912854, + "grad_norm": 0.2475283108277519, + "learning_rate": 0.00022183864741849569, + "loss": 2.4362, + "step": 6080 + }, + { + "epoch": 0.697282421740626, + "grad_norm": 0.2576912035458726, + "learning_rate": 0.00022168436008415687, + "loss": 2.5478, + "step": 6081 + }, + { + "epoch": 0.6973970874899668, + "grad_norm": 0.24904693317698942, + "learning_rate": 0.00022153011113815519, + "loss": 2.3927, + "step": 6082 + }, + { + "epoch": 0.6975117532393074, + "grad_norm": 0.24650944303798017, + "learning_rate": 0.00022137590060176637, + "loss": 2.4531, + "step": 6083 + }, + { + "epoch": 0.6976264189886481, + "grad_norm": 0.28935174808384223, + "learning_rate": 0.00022122172849626143, + "loss": 2.2884, + "step": 6084 + }, + { + "epoch": 0.6977410847379888, + "grad_norm": 0.24574798396334227, + "learning_rate": 0.00022106759484290483, + "loss": 2.3338, + "step": 6085 + }, + { + "epoch": 0.6978557504873294, + "grad_norm": 0.25787007344997465, + "learning_rate": 0.0002209134996629566, + "loss": 2.3664, + "step": 6086 + }, + { + "epoch": 0.6979704162366701, + "grad_norm": 0.25545711015600603, + "learning_rate": 0.00022075944297767137, + "loss": 2.4057, + "step": 6087 + }, + { + "epoch": 0.6980850819860108, + "grad_norm": 0.27204289605649984, + "learning_rate": 0.00022060542480829782, + "loss": 2.5412, + "step": 6088 + }, + { + "epoch": 0.6981997477353514, + "grad_norm": 0.2348975156654947, + "learning_rate": 0.00022045144517608033, + "loss": 2.4248, + "step": 6089 + }, + { + "epoch": 0.6983144134846921, + "grad_norm": 0.2534379627374989, + "learning_rate": 0.00022029750410225752, + "loss": 2.4142, + "step": 6090 + }, + { + "epoch": 0.6984290792340327, + "grad_norm": 0.25886380375220064, + "learning_rate": 0.00022014360160806218, + "loss": 2.4203, + "step": 6091 + }, + { + "epoch": 0.6985437449833735, + "grad_norm": 0.2529740471850219, + "learning_rate": 0.00021998973771472248, + "loss": 2.361, + "step": 6092 + }, + { + "epoch": 0.6986584107327142, + "grad_norm": 0.27199155463472, + "learning_rate": 0.00021983591244346097, + "loss": 2.4755, + "step": 6093 + }, + { + "epoch": 0.6987730764820548, + "grad_norm": 0.2783746824463265, + "learning_rate": 0.00021968212581549495, + "loss": 2.4007, + "step": 6094 + }, + { + "epoch": 0.6988877422313955, + "grad_norm": 0.25293474977809655, + "learning_rate": 0.00021952837785203634, + "loss": 2.5032, + "step": 6095 + }, + { + "epoch": 0.6990024079807362, + "grad_norm": 0.2529987731629939, + "learning_rate": 0.00021937466857429184, + "loss": 2.5169, + "step": 6096 + }, + { + "epoch": 0.6991170737300768, + "grad_norm": 0.24316642182432266, + "learning_rate": 0.00021922099800346246, + "loss": 2.3067, + "step": 6097 + }, + { + "epoch": 0.6992317394794175, + "grad_norm": 0.2542560577671439, + "learning_rate": 0.00021906736616074428, + "loss": 2.4223, + "step": 6098 + }, + { + "epoch": 0.6993464052287581, + "grad_norm": 0.2544461721743063, + "learning_rate": 0.0002189137730673279, + "loss": 2.3984, + "step": 6099 + }, + { + "epoch": 0.6994610709780988, + "grad_norm": 0.23192157333733684, + "learning_rate": 0.0002187602187443985, + "loss": 2.4086, + "step": 6100 + }, + { + "epoch": 0.6995757367274396, + "grad_norm": 0.2549273280267901, + "learning_rate": 0.00021860670321313604, + "loss": 2.3486, + "step": 6101 + }, + { + "epoch": 0.6996904024767802, + "grad_norm": 0.24624058290959622, + "learning_rate": 0.00021845322649471517, + "loss": 2.5079, + "step": 6102 + }, + { + "epoch": 0.6998050682261209, + "grad_norm": 0.2608300303069196, + "learning_rate": 0.00021829978861030459, + "loss": 2.4843, + "step": 6103 + }, + { + "epoch": 0.6999197339754615, + "grad_norm": 0.2475642543440845, + "learning_rate": 0.0002181463895810688, + "loss": 2.2747, + "step": 6104 + }, + { + "epoch": 0.7000343997248022, + "grad_norm": 0.2464811384869547, + "learning_rate": 0.00021799302942816578, + "loss": 2.2378, + "step": 6105 + }, + { + "epoch": 0.7001490654741429, + "grad_norm": 0.273222174520215, + "learning_rate": 0.00021783970817274885, + "loss": 2.3874, + "step": 6106 + }, + { + "epoch": 0.7002637312234835, + "grad_norm": 0.2528763674954494, + "learning_rate": 0.0002176864258359657, + "loss": 2.2565, + "step": 6107 + }, + { + "epoch": 0.7003783969728242, + "grad_norm": 0.2769712007325071, + "learning_rate": 0.0002175331824389587, + "loss": 2.4165, + "step": 6108 + }, + { + "epoch": 0.7004930627221649, + "grad_norm": 0.25442949628112366, + "learning_rate": 0.0002173799780028649, + "loss": 2.3315, + "step": 6109 + }, + { + "epoch": 0.7006077284715055, + "grad_norm": 0.2735310373855664, + "learning_rate": 0.00021722681254881604, + "loss": 2.514, + "step": 6110 + }, + { + "epoch": 0.7007223942208463, + "grad_norm": 0.26606288204312917, + "learning_rate": 0.00021707368609793815, + "loss": 2.2473, + "step": 6111 + }, + { + "epoch": 0.7008370599701869, + "grad_norm": 0.25193752688170873, + "learning_rate": 0.0002169205986713521, + "loss": 2.4051, + "step": 6112 + }, + { + "epoch": 0.7009517257195276, + "grad_norm": 0.2312633852700128, + "learning_rate": 0.00021676755029017354, + "loss": 2.4278, + "step": 6113 + }, + { + "epoch": 0.7010663914688683, + "grad_norm": 0.2468576856485859, + "learning_rate": 0.00021661454097551247, + "loss": 2.5959, + "step": 6114 + }, + { + "epoch": 0.7011810572182089, + "grad_norm": 0.2302460985510695, + "learning_rate": 0.00021646157074847372, + "loss": 2.3457, + "step": 6115 + }, + { + "epoch": 0.7012957229675496, + "grad_norm": 0.24866278183761428, + "learning_rate": 0.0002163086396301567, + "loss": 2.4546, + "step": 6116 + }, + { + "epoch": 0.7014103887168902, + "grad_norm": 0.23669034397686303, + "learning_rate": 0.00021615574764165496, + "loss": 2.3993, + "step": 6117 + }, + { + "epoch": 0.7015250544662309, + "grad_norm": 0.24069332240785304, + "learning_rate": 0.00021600289480405715, + "loss": 2.3245, + "step": 6118 + }, + { + "epoch": 0.7016397202155716, + "grad_norm": 0.2787556975542305, + "learning_rate": 0.0002158500811384469, + "loss": 2.511, + "step": 6119 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.24245597655539464, + "learning_rate": 0.00021569730666590132, + "loss": 2.4537, + "step": 6120 + }, + { + "epoch": 0.701869051714253, + "grad_norm": 0.25192496185685076, + "learning_rate": 0.000215544571407493, + "loss": 2.4153, + "step": 6121 + }, + { + "epoch": 0.7019837174635937, + "grad_norm": 0.249582614256594, + "learning_rate": 0.00021539187538428906, + "loss": 2.3925, + "step": 6122 + }, + { + "epoch": 0.7020983832129343, + "grad_norm": 0.240094425234144, + "learning_rate": 0.0002152392186173504, + "loss": 2.3414, + "step": 6123 + }, + { + "epoch": 0.702213048962275, + "grad_norm": 0.2505059477568348, + "learning_rate": 0.0002150866011277337, + "loss": 2.4865, + "step": 6124 + }, + { + "epoch": 0.7023277147116156, + "grad_norm": 0.27627480796233034, + "learning_rate": 0.0002149340229364896, + "loss": 2.57, + "step": 6125 + }, + { + "epoch": 0.7024423804609563, + "grad_norm": 0.24354523082493298, + "learning_rate": 0.00021478148406466302, + "loss": 2.4071, + "step": 6126 + }, + { + "epoch": 0.702557046210297, + "grad_norm": 0.26369275833503175, + "learning_rate": 0.00021462898453329394, + "loss": 2.3926, + "step": 6127 + }, + { + "epoch": 0.7026717119596376, + "grad_norm": 0.2724997102298509, + "learning_rate": 0.00021447652436341674, + "loss": 2.5047, + "step": 6128 + }, + { + "epoch": 0.7027863777089783, + "grad_norm": 0.2538795470584879, + "learning_rate": 0.00021432410357606046, + "loss": 2.5197, + "step": 6129 + }, + { + "epoch": 0.702901043458319, + "grad_norm": 0.23910144882538228, + "learning_rate": 0.0002141717221922486, + "loss": 2.4812, + "step": 6130 + }, + { + "epoch": 0.7030157092076597, + "grad_norm": 0.23914988826423383, + "learning_rate": 0.0002140193802329995, + "loss": 2.4265, + "step": 6131 + }, + { + "epoch": 0.7031303749570004, + "grad_norm": 0.2510558133713407, + "learning_rate": 0.0002138670777193254, + "loss": 2.3365, + "step": 6132 + }, + { + "epoch": 0.703245040706341, + "grad_norm": 0.2906966616706873, + "learning_rate": 0.00021371481467223358, + "loss": 2.3397, + "step": 6133 + }, + { + "epoch": 0.7033597064556817, + "grad_norm": 0.25808191233504235, + "learning_rate": 0.0002135625911127263, + "loss": 2.5402, + "step": 6134 + }, + { + "epoch": 0.7034743722050224, + "grad_norm": 0.23651133898314605, + "learning_rate": 0.00021341040706179942, + "loss": 2.5378, + "step": 6135 + }, + { + "epoch": 0.703589037954363, + "grad_norm": 0.24692749896526253, + "learning_rate": 0.000213258262540444, + "loss": 2.4893, + "step": 6136 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.25971551793412295, + "learning_rate": 0.0002131061575696457, + "loss": 2.386, + "step": 6137 + }, + { + "epoch": 0.7038183694530443, + "grad_norm": 0.2258367772341615, + "learning_rate": 0.0002129540921703838, + "loss": 2.4342, + "step": 6138 + }, + { + "epoch": 0.703933035202385, + "grad_norm": 0.2380350894943928, + "learning_rate": 0.00021280206636363358, + "loss": 2.3896, + "step": 6139 + }, + { + "epoch": 0.7040477009517258, + "grad_norm": 0.24301671728180796, + "learning_rate": 0.0002126500801703639, + "loss": 2.484, + "step": 6140 + }, + { + "epoch": 0.7041623667010664, + "grad_norm": 0.2586781392192157, + "learning_rate": 0.00021249813361153819, + "loss": 2.4672, + "step": 6141 + }, + { + "epoch": 0.7042770324504071, + "grad_norm": 0.2645662491101431, + "learning_rate": 0.0002123462267081146, + "loss": 2.3119, + "step": 6142 + }, + { + "epoch": 0.7043916981997478, + "grad_norm": 0.2591620900363445, + "learning_rate": 0.00021219435948104586, + "loss": 2.3664, + "step": 6143 + }, + { + "epoch": 0.7045063639490884, + "grad_norm": 0.26698700705465, + "learning_rate": 0.00021204253195127916, + "loss": 2.4214, + "step": 6144 + }, + { + "epoch": 0.7046210296984291, + "grad_norm": 0.2575902621445413, + "learning_rate": 0.00021189074413975622, + "loss": 2.2619, + "step": 6145 + }, + { + "epoch": 0.7047356954477697, + "grad_norm": 0.2294340362426549, + "learning_rate": 0.00021173899606741342, + "loss": 2.2688, + "step": 6146 + }, + { + "epoch": 0.7048503611971104, + "grad_norm": 0.23319524984019208, + "learning_rate": 0.0002115872877551812, + "loss": 2.2491, + "step": 6147 + }, + { + "epoch": 0.7049650269464511, + "grad_norm": 0.24685819642436638, + "learning_rate": 0.00021143561922398497, + "loss": 2.3206, + "step": 6148 + }, + { + "epoch": 0.7050796926957917, + "grad_norm": 0.24236108209047455, + "learning_rate": 0.00021128399049474456, + "loss": 2.5078, + "step": 6149 + }, + { + "epoch": 0.7051943584451325, + "grad_norm": 0.23443693361503415, + "learning_rate": 0.0002111324015883742, + "loss": 2.3633, + "step": 6150 + }, + { + "epoch": 0.705309024194473, + "grad_norm": 0.2687937375866934, + "learning_rate": 0.00021098085252578276, + "loss": 2.3525, + "step": 6151 + }, + { + "epoch": 0.7054236899438138, + "grad_norm": 0.2588047585068749, + "learning_rate": 0.00021082934332787367, + "loss": 2.3487, + "step": 6152 + }, + { + "epoch": 0.7055383556931545, + "grad_norm": 0.23730068914264443, + "learning_rate": 0.00021067787401554423, + "loss": 2.4015, + "step": 6153 + }, + { + "epoch": 0.7056530214424951, + "grad_norm": 0.2816192620347726, + "learning_rate": 0.0002105264446096874, + "loss": 2.4742, + "step": 6154 + }, + { + "epoch": 0.7057676871918358, + "grad_norm": 0.28149055521360683, + "learning_rate": 0.00021037505513118955, + "loss": 2.4745, + "step": 6155 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.2544194421912847, + "learning_rate": 0.000210223705600932, + "loss": 2.4349, + "step": 6156 + }, + { + "epoch": 0.7059970186905171, + "grad_norm": 0.25789215153767386, + "learning_rate": 0.00021007239603979073, + "loss": 2.3918, + "step": 6157 + }, + { + "epoch": 0.7061116844398578, + "grad_norm": 0.24298011585530466, + "learning_rate": 0.00020992112646863581, + "loss": 2.4304, + "step": 6158 + }, + { + "epoch": 0.7062263501891984, + "grad_norm": 0.2277606776129831, + "learning_rate": 0.00020976989690833214, + "loss": 2.3179, + "step": 6159 + }, + { + "epoch": 0.7063410159385392, + "grad_norm": 0.2763483069212914, + "learning_rate": 0.0002096187073797391, + "loss": 2.4563, + "step": 6160 + }, + { + "epoch": 0.7064556816878799, + "grad_norm": 0.284254106250262, + "learning_rate": 0.00020946755790371004, + "loss": 2.4251, + "step": 6161 + }, + { + "epoch": 0.7065703474372205, + "grad_norm": 0.2605713308098253, + "learning_rate": 0.00020931644850109322, + "loss": 2.3959, + "step": 6162 + }, + { + "epoch": 0.7066850131865612, + "grad_norm": 0.24540620666098126, + "learning_rate": 0.00020916537919273148, + "loss": 2.3294, + "step": 6163 + }, + { + "epoch": 0.7067996789359019, + "grad_norm": 0.29653632633471655, + "learning_rate": 0.00020901434999946183, + "loss": 2.4973, + "step": 6164 + }, + { + "epoch": 0.7069143446852425, + "grad_norm": 0.2562281535222324, + "learning_rate": 0.00020886336094211595, + "loss": 2.2602, + "step": 6165 + }, + { + "epoch": 0.7070290104345832, + "grad_norm": 0.2625016806204164, + "learning_rate": 0.00020871241204151997, + "loss": 2.3983, + "step": 6166 + }, + { + "epoch": 0.7071436761839238, + "grad_norm": 0.26457796389038596, + "learning_rate": 0.00020856150331849415, + "loss": 2.5286, + "step": 6167 + }, + { + "epoch": 0.7072583419332645, + "grad_norm": 0.24165276782952522, + "learning_rate": 0.00020841063479385346, + "loss": 2.3958, + "step": 6168 + }, + { + "epoch": 0.7073730076826052, + "grad_norm": 0.24464755491755444, + "learning_rate": 0.00020825980648840788, + "loss": 2.389, + "step": 6169 + }, + { + "epoch": 0.7074876734319459, + "grad_norm": 0.28052073661345367, + "learning_rate": 0.00020810901842296072, + "loss": 2.4862, + "step": 6170 + }, + { + "epoch": 0.7076023391812866, + "grad_norm": 0.2706153663700323, + "learning_rate": 0.00020795827061831045, + "loss": 2.4206, + "step": 6171 + }, + { + "epoch": 0.7077170049306272, + "grad_norm": 0.2405468956448794, + "learning_rate": 0.00020780756309525024, + "loss": 2.4842, + "step": 6172 + }, + { + "epoch": 0.7078316706799679, + "grad_norm": 0.2707652751124375, + "learning_rate": 0.00020765689587456655, + "loss": 2.4222, + "step": 6173 + }, + { + "epoch": 0.7079463364293086, + "grad_norm": 0.22660517580232212, + "learning_rate": 0.00020750626897704172, + "loss": 2.3964, + "step": 6174 + }, + { + "epoch": 0.7080610021786492, + "grad_norm": 0.27332411494008946, + "learning_rate": 0.0002073556824234518, + "loss": 2.4166, + "step": 6175 + }, + { + "epoch": 0.7081756679279899, + "grad_norm": 0.2471906328371607, + "learning_rate": 0.00020720513623456705, + "loss": 2.4128, + "step": 6176 + }, + { + "epoch": 0.7082903336773306, + "grad_norm": 0.24240664026656653, + "learning_rate": 0.0002070546304311525, + "loss": 2.4684, + "step": 6177 + }, + { + "epoch": 0.7084049994266712, + "grad_norm": 0.2605443583995794, + "learning_rate": 0.00020690416503396774, + "loss": 2.4241, + "step": 6178 + }, + { + "epoch": 0.708519665176012, + "grad_norm": 0.22567155238789033, + "learning_rate": 0.00020675374006376646, + "loss": 2.4143, + "step": 6179 + }, + { + "epoch": 0.7086343309253526, + "grad_norm": 0.24694175963077794, + "learning_rate": 0.00020660335554129695, + "loss": 2.4058, + "step": 6180 + }, + { + "epoch": 0.7087489966746933, + "grad_norm": 0.23363024279092437, + "learning_rate": 0.00020645301148730205, + "loss": 2.4078, + "step": 6181 + }, + { + "epoch": 0.708863662424034, + "grad_norm": 0.23511955165853576, + "learning_rate": 0.00020630270792251854, + "loss": 2.352, + "step": 6182 + }, + { + "epoch": 0.7089783281733746, + "grad_norm": 0.2731031809677535, + "learning_rate": 0.00020615244486767793, + "loss": 2.3433, + "step": 6183 + }, + { + "epoch": 0.7090929939227153, + "grad_norm": 0.2440709017597821, + "learning_rate": 0.00020600222234350664, + "loss": 2.3585, + "step": 6184 + }, + { + "epoch": 0.709207659672056, + "grad_norm": 0.2609437795357541, + "learning_rate": 0.00020585204037072447, + "loss": 2.448, + "step": 6185 + }, + { + "epoch": 0.7093223254213966, + "grad_norm": 0.25063624602538603, + "learning_rate": 0.00020570189897004638, + "loss": 2.3258, + "step": 6186 + }, + { + "epoch": 0.7094369911707373, + "grad_norm": 0.26227554894561483, + "learning_rate": 0.00020555179816218166, + "loss": 2.4188, + "step": 6187 + }, + { + "epoch": 0.7095516569200779, + "grad_norm": 0.24933635997827105, + "learning_rate": 0.0002054017379678334, + "loss": 2.4555, + "step": 6188 + }, + { + "epoch": 0.7096663226694186, + "grad_norm": 0.282008584290258, + "learning_rate": 0.00020525171840769996, + "loss": 2.4421, + "step": 6189 + }, + { + "epoch": 0.7097809884187594, + "grad_norm": 0.24081906831505961, + "learning_rate": 0.00020510173950247373, + "loss": 2.4536, + "step": 6190 + }, + { + "epoch": 0.7098956541681, + "grad_norm": 0.2537676245957966, + "learning_rate": 0.00020495180127284108, + "loss": 2.4773, + "step": 6191 + }, + { + "epoch": 0.7100103199174407, + "grad_norm": 0.25246782289630526, + "learning_rate": 0.00020480190373948332, + "loss": 2.4319, + "step": 6192 + }, + { + "epoch": 0.7101249856667813, + "grad_norm": 0.25982914474746366, + "learning_rate": 0.00020465204692307598, + "loss": 2.3953, + "step": 6193 + }, + { + "epoch": 0.710239651416122, + "grad_norm": 0.24891645510139418, + "learning_rate": 0.00020450223084428888, + "loss": 2.4694, + "step": 6194 + }, + { + "epoch": 0.7103543171654627, + "grad_norm": 0.2794990509879442, + "learning_rate": 0.00020435245552378635, + "loss": 2.4143, + "step": 6195 + }, + { + "epoch": 0.7104689829148033, + "grad_norm": 0.24623750048276213, + "learning_rate": 0.00020420272098222724, + "loss": 2.4088, + "step": 6196 + }, + { + "epoch": 0.710583648664144, + "grad_norm": 0.24521645346691387, + "learning_rate": 0.00020405302724026415, + "loss": 2.5882, + "step": 6197 + }, + { + "epoch": 0.7106983144134847, + "grad_norm": 0.24331008381446623, + "learning_rate": 0.00020390337431854466, + "loss": 2.3338, + "step": 6198 + }, + { + "epoch": 0.7108129801628253, + "grad_norm": 0.2337402647747093, + "learning_rate": 0.00020375376223771062, + "loss": 2.3866, + "step": 6199 + }, + { + "epoch": 0.7109276459121661, + "grad_norm": 0.2632061636402899, + "learning_rate": 0.00020360419101839812, + "loss": 2.3893, + "step": 6200 + }, + { + "epoch": 0.7110423116615067, + "grad_norm": 0.2535343911901774, + "learning_rate": 0.00020345466068123758, + "loss": 2.4511, + "step": 6201 + }, + { + "epoch": 0.7111569774108474, + "grad_norm": 0.24851256140778466, + "learning_rate": 0.00020330517124685416, + "loss": 2.458, + "step": 6202 + }, + { + "epoch": 0.7112716431601881, + "grad_norm": 0.28028099997603184, + "learning_rate": 0.00020315572273586646, + "loss": 2.4785, + "step": 6203 + }, + { + "epoch": 0.7113863089095287, + "grad_norm": 0.264445909212145, + "learning_rate": 0.00020300631516888883, + "loss": 2.4593, + "step": 6204 + }, + { + "epoch": 0.7115009746588694, + "grad_norm": 0.25340463036099214, + "learning_rate": 0.00020285694856652864, + "loss": 2.5138, + "step": 6205 + }, + { + "epoch": 0.71161564040821, + "grad_norm": 0.26817628504477736, + "learning_rate": 0.00020270762294938833, + "loss": 2.3434, + "step": 6206 + }, + { + "epoch": 0.7117303061575507, + "grad_norm": 0.2518760529370294, + "learning_rate": 0.00020255833833806452, + "loss": 2.4225, + "step": 6207 + }, + { + "epoch": 0.7118449719068914, + "grad_norm": 0.23481911222833723, + "learning_rate": 0.00020240909475314816, + "loss": 2.4746, + "step": 6208 + }, + { + "epoch": 0.711959637656232, + "grad_norm": 0.26712626193944317, + "learning_rate": 0.00020225989221522462, + "loss": 2.4006, + "step": 6209 + }, + { + "epoch": 0.7120743034055728, + "grad_norm": 0.29652177995680157, + "learning_rate": 0.00020211073074487363, + "loss": 2.4567, + "step": 6210 + }, + { + "epoch": 0.7121889691549135, + "grad_norm": 0.2477749072997504, + "learning_rate": 0.00020196161036266892, + "loss": 2.5365, + "step": 6211 + }, + { + "epoch": 0.7123036349042541, + "grad_norm": 0.24869835536299567, + "learning_rate": 0.0002018125310891789, + "loss": 2.3424, + "step": 6212 + }, + { + "epoch": 0.7124183006535948, + "grad_norm": 0.2826178818608545, + "learning_rate": 0.00020166349294496622, + "loss": 2.5115, + "step": 6213 + }, + { + "epoch": 0.7125329664029354, + "grad_norm": 0.24591263641471692, + "learning_rate": 0.0002015144959505879, + "loss": 2.5305, + "step": 6214 + }, + { + "epoch": 0.7126476321522761, + "grad_norm": 0.27625355600767626, + "learning_rate": 0.00020136554012659526, + "loss": 2.3493, + "step": 6215 + }, + { + "epoch": 0.7127622979016168, + "grad_norm": 0.2597629111832175, + "learning_rate": 0.00020121662549353402, + "loss": 2.4192, + "step": 6216 + }, + { + "epoch": 0.7128769636509574, + "grad_norm": 0.24194281370495624, + "learning_rate": 0.0002010677520719438, + "loss": 2.4677, + "step": 6217 + }, + { + "epoch": 0.7129916294002981, + "grad_norm": 0.2508260808037649, + "learning_rate": 0.00020091891988235878, + "loss": 2.4265, + "step": 6218 + }, + { + "epoch": 0.7131062951496389, + "grad_norm": 0.2774659156109727, + "learning_rate": 0.00020077012894530817, + "loss": 2.436, + "step": 6219 + }, + { + "epoch": 0.7132209608989795, + "grad_norm": 0.2717706387504273, + "learning_rate": 0.00020062137928131418, + "loss": 2.385, + "step": 6220 + }, + { + "epoch": 0.7133356266483202, + "grad_norm": 0.24577502463146314, + "learning_rate": 0.00020047267091089432, + "loss": 2.4795, + "step": 6221 + }, + { + "epoch": 0.7134502923976608, + "grad_norm": 0.2724709289924197, + "learning_rate": 0.0002003240038545602, + "loss": 2.4866, + "step": 6222 + }, + { + "epoch": 0.7135649581470015, + "grad_norm": 0.23771636484757336, + "learning_rate": 0.00020017537813281694, + "loss": 2.295, + "step": 6223 + }, + { + "epoch": 0.7136796238963422, + "grad_norm": 0.23673611553423884, + "learning_rate": 0.00020002679376616533, + "loss": 2.4352, + "step": 6224 + }, + { + "epoch": 0.7137942896456828, + "grad_norm": 0.2450469193955017, + "learning_rate": 0.0001998782507750997, + "loss": 2.3508, + "step": 6225 + }, + { + "epoch": 0.7139089553950235, + "grad_norm": 0.23537014443521556, + "learning_rate": 0.00019972974918010833, + "loss": 2.4463, + "step": 6226 + }, + { + "epoch": 0.7140236211443641, + "grad_norm": 0.25208541887548586, + "learning_rate": 0.00019958128900167437, + "loss": 2.5648, + "step": 6227 + }, + { + "epoch": 0.7141382868937048, + "grad_norm": 0.22977789869581988, + "learning_rate": 0.00019943287026027518, + "loss": 2.2609, + "step": 6228 + }, + { + "epoch": 0.7142529526430456, + "grad_norm": 0.23599264727608193, + "learning_rate": 0.0001992844929763821, + "loss": 2.4706, + "step": 6229 + }, + { + "epoch": 0.7143676183923862, + "grad_norm": 0.2498939136285986, + "learning_rate": 0.00019913615717046112, + "loss": 2.3906, + "step": 6230 + }, + { + "epoch": 0.7144822841417269, + "grad_norm": 0.25055903426110343, + "learning_rate": 0.00019898786286297242, + "loss": 2.3875, + "step": 6231 + }, + { + "epoch": 0.7145969498910676, + "grad_norm": 0.25410998216773506, + "learning_rate": 0.00019883961007437006, + "loss": 2.4278, + "step": 6232 + }, + { + "epoch": 0.7147116156404082, + "grad_norm": 0.23318905116867952, + "learning_rate": 0.0001986913988251027, + "loss": 2.3679, + "step": 6233 + }, + { + "epoch": 0.7148262813897489, + "grad_norm": 0.2736715339408412, + "learning_rate": 0.00019854322913561372, + "loss": 2.4014, + "step": 6234 + }, + { + "epoch": 0.7149409471390895, + "grad_norm": 0.27218399871702403, + "learning_rate": 0.00019839510102633985, + "loss": 2.3683, + "step": 6235 + }, + { + "epoch": 0.7150556128884302, + "grad_norm": 0.24468006633495115, + "learning_rate": 0.00019824701451771266, + "loss": 2.2595, + "step": 6236 + }, + { + "epoch": 0.7151702786377709, + "grad_norm": 0.25197957782058705, + "learning_rate": 0.00019809896963015805, + "loss": 2.5135, + "step": 6237 + }, + { + "epoch": 0.7152849443871115, + "grad_norm": 0.2906670527447589, + "learning_rate": 0.00019795096638409548, + "loss": 2.4263, + "step": 6238 + }, + { + "epoch": 0.7153996101364523, + "grad_norm": 0.2389474328017037, + "learning_rate": 0.0001978030047999397, + "loss": 2.3709, + "step": 6239 + }, + { + "epoch": 0.7155142758857929, + "grad_norm": 0.25059375558875024, + "learning_rate": 0.00019765508489809913, + "loss": 2.4834, + "step": 6240 + }, + { + "epoch": 0.7156289416351336, + "grad_norm": 0.2485915415453671, + "learning_rate": 0.00019750720669897627, + "loss": 2.2594, + "step": 6241 + }, + { + "epoch": 0.7157436073844743, + "grad_norm": 0.2782216358327332, + "learning_rate": 0.00019735937022296817, + "loss": 2.3428, + "step": 6242 + }, + { + "epoch": 0.7158582731338149, + "grad_norm": 0.2443229624657192, + "learning_rate": 0.00019721157549046613, + "loss": 2.4073, + "step": 6243 + }, + { + "epoch": 0.7159729388831556, + "grad_norm": 0.26841422563857065, + "learning_rate": 0.00019706382252185556, + "loss": 2.54, + "step": 6244 + }, + { + "epoch": 0.7160876046324963, + "grad_norm": 0.2590504279642476, + "learning_rate": 0.0001969161113375162, + "loss": 2.3342, + "step": 6245 + }, + { + "epoch": 0.7162022703818369, + "grad_norm": 0.2502875754748808, + "learning_rate": 0.00019676844195782217, + "loss": 2.4302, + "step": 6246 + }, + { + "epoch": 0.7163169361311776, + "grad_norm": 0.2587318390869682, + "learning_rate": 0.00019662081440314122, + "loss": 2.3865, + "step": 6247 + }, + { + "epoch": 0.7164316018805182, + "grad_norm": 0.2432238886924314, + "learning_rate": 0.000196473228693836, + "loss": 2.4883, + "step": 6248 + }, + { + "epoch": 0.716546267629859, + "grad_norm": 0.23979503693380924, + "learning_rate": 0.00019632568485026313, + "loss": 2.4638, + "step": 6249 + }, + { + "epoch": 0.7166609333791997, + "grad_norm": 0.25445703718254303, + "learning_rate": 0.00019617818289277355, + "loss": 2.4098, + "step": 6250 + }, + { + "epoch": 0.7167755991285403, + "grad_norm": 0.25880728685465315, + "learning_rate": 0.0001960307228417122, + "loss": 2.5151, + "step": 6251 + }, + { + "epoch": 0.716890264877881, + "grad_norm": 0.251278764904902, + "learning_rate": 0.00019588330471741866, + "loss": 2.257, + "step": 6252 + }, + { + "epoch": 0.7170049306272217, + "grad_norm": 0.2510446734340795, + "learning_rate": 0.00019573592854022588, + "loss": 2.351, + "step": 6253 + }, + { + "epoch": 0.7171195963765623, + "grad_norm": 0.2866824381754139, + "learning_rate": 0.00019558859433046227, + "loss": 2.4946, + "step": 6254 + }, + { + "epoch": 0.717234262125903, + "grad_norm": 0.23490741112180144, + "learning_rate": 0.00019544130210844934, + "loss": 2.3133, + "step": 6255 + }, + { + "epoch": 0.7173489278752436, + "grad_norm": 0.2391580626887255, + "learning_rate": 0.00019529405189450333, + "loss": 2.4463, + "step": 6256 + }, + { + "epoch": 0.7174635936245843, + "grad_norm": 0.21821070999377562, + "learning_rate": 0.00019514684370893466, + "loss": 2.387, + "step": 6257 + }, + { + "epoch": 0.717578259373925, + "grad_norm": 0.2510103018797439, + "learning_rate": 0.0001949996775720479, + "loss": 2.4026, + "step": 6258 + }, + { + "epoch": 0.7176929251232657, + "grad_norm": 0.26956321145554135, + "learning_rate": 0.00019485255350414171, + "loss": 2.3856, + "step": 6259 + }, + { + "epoch": 0.7178075908726064, + "grad_norm": 0.24787895222937587, + "learning_rate": 0.00019470547152550938, + "loss": 2.5316, + "step": 6260 + }, + { + "epoch": 0.717922256621947, + "grad_norm": 0.26518199109472634, + "learning_rate": 0.00019455843165643754, + "loss": 2.2918, + "step": 6261 + }, + { + "epoch": 0.7180369223712877, + "grad_norm": 0.25193365039969184, + "learning_rate": 0.0001944114339172079, + "loss": 2.4664, + "step": 6262 + }, + { + "epoch": 0.7181515881206284, + "grad_norm": 0.24643153521798708, + "learning_rate": 0.00019426447832809586, + "loss": 2.4356, + "step": 6263 + }, + { + "epoch": 0.718266253869969, + "grad_norm": 0.2744099837412709, + "learning_rate": 0.00019411756490937115, + "loss": 2.5434, + "step": 6264 + }, + { + "epoch": 0.7183809196193097, + "grad_norm": 0.2452611451323808, + "learning_rate": 0.00019397069368129777, + "loss": 2.1427, + "step": 6265 + }, + { + "epoch": 0.7184955853686504, + "grad_norm": 0.24903537160430875, + "learning_rate": 0.00019382386466413394, + "loss": 2.4363, + "step": 6266 + }, + { + "epoch": 0.718610251117991, + "grad_norm": 0.2573727411348979, + "learning_rate": 0.0001936770778781315, + "loss": 2.3807, + "step": 6267 + }, + { + "epoch": 0.7187249168673318, + "grad_norm": 0.25108334600727267, + "learning_rate": 0.000193530333343537, + "loss": 2.3154, + "step": 6268 + }, + { + "epoch": 0.7188395826166724, + "grad_norm": 0.2593871158355924, + "learning_rate": 0.00019338363108059142, + "loss": 2.4615, + "step": 6269 + }, + { + "epoch": 0.7189542483660131, + "grad_norm": 0.26274250915099306, + "learning_rate": 0.0001932369711095292, + "loss": 2.4868, + "step": 6270 + }, + { + "epoch": 0.7190689141153538, + "grad_norm": 0.2419829542783949, + "learning_rate": 0.00019309035345057942, + "loss": 2.4262, + "step": 6271 + }, + { + "epoch": 0.7191835798646944, + "grad_norm": 0.24149502549488625, + "learning_rate": 0.00019294377812396524, + "loss": 2.3419, + "step": 6272 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 0.2383070200164491, + "learning_rate": 0.00019279724514990356, + "loss": 2.501, + "step": 6273 + }, + { + "epoch": 0.7194129113633757, + "grad_norm": 0.265126603393842, + "learning_rate": 0.0001926507545486062, + "loss": 2.5041, + "step": 6274 + }, + { + "epoch": 0.7195275771127164, + "grad_norm": 0.3003548243124077, + "learning_rate": 0.00019250430634027883, + "loss": 2.4454, + "step": 6275 + }, + { + "epoch": 0.7196422428620571, + "grad_norm": 0.2316324848199238, + "learning_rate": 0.00019235790054512086, + "loss": 2.3529, + "step": 6276 + }, + { + "epoch": 0.7197569086113977, + "grad_norm": 0.2472042958246401, + "learning_rate": 0.0001922115371833263, + "loss": 2.354, + "step": 6277 + }, + { + "epoch": 0.7198715743607385, + "grad_norm": 0.29767880618529297, + "learning_rate": 0.0001920652162750832, + "loss": 2.3726, + "step": 6278 + }, + { + "epoch": 0.7199862401100792, + "grad_norm": 0.2529227827924357, + "learning_rate": 0.00019191893784057374, + "loss": 2.4425, + "step": 6279 + }, + { + "epoch": 0.7201009058594198, + "grad_norm": 0.26951910727094114, + "learning_rate": 0.00019177270189997425, + "loss": 2.3759, + "step": 6280 + }, + { + "epoch": 0.7202155716087605, + "grad_norm": 0.29237078754309104, + "learning_rate": 0.00019162650847345542, + "loss": 2.4389, + "step": 6281 + }, + { + "epoch": 0.7203302373581011, + "grad_norm": 0.295590979124636, + "learning_rate": 0.00019148035758118138, + "loss": 2.47, + "step": 6282 + }, + { + "epoch": 0.7204449031074418, + "grad_norm": 0.27554983238441616, + "learning_rate": 0.0001913342492433109, + "loss": 2.3924, + "step": 6283 + }, + { + "epoch": 0.7205595688567825, + "grad_norm": 0.27155205442555047, + "learning_rate": 0.0001911881834799974, + "loss": 2.4521, + "step": 6284 + }, + { + "epoch": 0.7206742346061231, + "grad_norm": 0.2698033299533288, + "learning_rate": 0.0001910421603113873, + "loss": 2.419, + "step": 6285 + }, + { + "epoch": 0.7207889003554638, + "grad_norm": 0.27553646680522675, + "learning_rate": 0.00019089617975762198, + "loss": 2.5869, + "step": 6286 + }, + { + "epoch": 0.7209035661048045, + "grad_norm": 0.25486455796290763, + "learning_rate": 0.00019075024183883672, + "loss": 2.4118, + "step": 6287 + }, + { + "epoch": 0.7210182318541452, + "grad_norm": 0.2685407220536627, + "learning_rate": 0.00019060434657516045, + "loss": 2.3728, + "step": 6288 + }, + { + "epoch": 0.7211328976034859, + "grad_norm": 0.27426053160859754, + "learning_rate": 0.0001904584939867171, + "loss": 2.4482, + "step": 6289 + }, + { + "epoch": 0.7212475633528265, + "grad_norm": 0.26708925021555474, + "learning_rate": 0.0001903126840936243, + "loss": 2.3926, + "step": 6290 + }, + { + "epoch": 0.7213622291021672, + "grad_norm": 0.23446968936020765, + "learning_rate": 0.00019016691691599335, + "loss": 2.3789, + "step": 6291 + }, + { + "epoch": 0.7214768948515079, + "grad_norm": 0.25998850130756773, + "learning_rate": 0.00019002119247393028, + "loss": 2.3509, + "step": 6292 + }, + { + "epoch": 0.7215915606008485, + "grad_norm": 0.29050952876957775, + "learning_rate": 0.000189875510787535, + "loss": 2.4089, + "step": 6293 + }, + { + "epoch": 0.7217062263501892, + "grad_norm": 0.2525993141090884, + "learning_rate": 0.00018972987187690149, + "loss": 2.3216, + "step": 6294 + }, + { + "epoch": 0.7218208920995298, + "grad_norm": 0.25212862423752697, + "learning_rate": 0.00018958427576211783, + "loss": 2.4313, + "step": 6295 + }, + { + "epoch": 0.7219355578488705, + "grad_norm": 0.29600205599174806, + "learning_rate": 0.00018943872246326643, + "loss": 2.4789, + "step": 6296 + }, + { + "epoch": 0.7220502235982112, + "grad_norm": 0.27153656854742864, + "learning_rate": 0.0001892932120004233, + "loss": 2.4877, + "step": 6297 + }, + { + "epoch": 0.7221648893475519, + "grad_norm": 0.27301803270981717, + "learning_rate": 0.00018914774439365896, + "loss": 2.4937, + "step": 6298 + }, + { + "epoch": 0.7222795550968926, + "grad_norm": 0.27959387007577485, + "learning_rate": 0.0001890023196630379, + "loss": 2.3753, + "step": 6299 + }, + { + "epoch": 0.7223942208462333, + "grad_norm": 0.24717647338007923, + "learning_rate": 0.0001888569378286187, + "loss": 2.3679, + "step": 6300 + }, + { + "epoch": 0.7225088865955739, + "grad_norm": 0.25098079037604293, + "learning_rate": 0.0001887115989104539, + "loss": 2.5038, + "step": 6301 + }, + { + "epoch": 0.7226235523449146, + "grad_norm": 0.27908070611136876, + "learning_rate": 0.00018856630292859062, + "loss": 2.5158, + "step": 6302 + }, + { + "epoch": 0.7227382180942552, + "grad_norm": 0.2668373810654318, + "learning_rate": 0.00018842104990306896, + "loss": 2.4835, + "step": 6303 + }, + { + "epoch": 0.7228528838435959, + "grad_norm": 0.2841194733746334, + "learning_rate": 0.00018827583985392467, + "loss": 2.4008, + "step": 6304 + }, + { + "epoch": 0.7229675495929366, + "grad_norm": 0.265193063923442, + "learning_rate": 0.000188130672801186, + "loss": 2.5068, + "step": 6305 + }, + { + "epoch": 0.7230822153422772, + "grad_norm": 0.2547161387389508, + "learning_rate": 0.00018798554876487628, + "loss": 2.3686, + "step": 6306 + }, + { + "epoch": 0.723196881091618, + "grad_norm": 0.27225987870501694, + "learning_rate": 0.00018784046776501261, + "loss": 2.5141, + "step": 6307 + }, + { + "epoch": 0.7233115468409586, + "grad_norm": 0.2264798497366811, + "learning_rate": 0.00018769542982160608, + "loss": 2.5625, + "step": 6308 + }, + { + "epoch": 0.7234262125902993, + "grad_norm": 0.25793381300077495, + "learning_rate": 0.00018755043495466195, + "loss": 2.4379, + "step": 6309 + }, + { + "epoch": 0.72354087833964, + "grad_norm": 0.24650215639070633, + "learning_rate": 0.0001874054831841796, + "loss": 2.4676, + "step": 6310 + }, + { + "epoch": 0.7236555440889806, + "grad_norm": 0.2439435786702376, + "learning_rate": 0.00018726057453015215, + "loss": 2.4076, + "step": 6311 + }, + { + "epoch": 0.7237702098383213, + "grad_norm": 0.2897404988533681, + "learning_rate": 0.00018711570901256707, + "loss": 2.5332, + "step": 6312 + }, + { + "epoch": 0.723884875587662, + "grad_norm": 0.2414452327815872, + "learning_rate": 0.0001869708866514058, + "loss": 2.3204, + "step": 6313 + }, + { + "epoch": 0.7239995413370026, + "grad_norm": 0.24871506945495983, + "learning_rate": 0.00018682610746664386, + "loss": 2.4646, + "step": 6314 + }, + { + "epoch": 0.7241142070863433, + "grad_norm": 0.24184806579039836, + "learning_rate": 0.00018668137147825076, + "loss": 2.3158, + "step": 6315 + }, + { + "epoch": 0.7242288728356839, + "grad_norm": 0.2757096431387559, + "learning_rate": 0.00018653667870619024, + "loss": 2.4439, + "step": 6316 + }, + { + "epoch": 0.7243435385850246, + "grad_norm": 0.23700227749077915, + "learning_rate": 0.00018639202917041954, + "loss": 2.3628, + "step": 6317 + }, + { + "epoch": 0.7244582043343654, + "grad_norm": 0.25116409805069695, + "learning_rate": 0.0001862474228908903, + "loss": 2.4153, + "step": 6318 + }, + { + "epoch": 0.724572870083706, + "grad_norm": 0.2445765787977314, + "learning_rate": 0.00018610285988754883, + "loss": 2.5184, + "step": 6319 + }, + { + "epoch": 0.7246875358330467, + "grad_norm": 0.2804681948833104, + "learning_rate": 0.0001859583401803342, + "loss": 2.3973, + "step": 6320 + }, + { + "epoch": 0.7248022015823874, + "grad_norm": 0.257869270058341, + "learning_rate": 0.00018581386378918035, + "loss": 2.4286, + "step": 6321 + }, + { + "epoch": 0.724916867331728, + "grad_norm": 0.2506811720828909, + "learning_rate": 0.00018566943073401526, + "loss": 2.4745, + "step": 6322 + }, + { + "epoch": 0.7250315330810687, + "grad_norm": 0.2461873243266885, + "learning_rate": 0.00018552504103476014, + "loss": 2.3219, + "step": 6323 + }, + { + "epoch": 0.7251461988304093, + "grad_norm": 0.28801032880318717, + "learning_rate": 0.0001853806947113314, + "loss": 2.3951, + "step": 6324 + }, + { + "epoch": 0.72526086457975, + "grad_norm": 0.25259642828538714, + "learning_rate": 0.00018523639178363876, + "loss": 2.3336, + "step": 6325 + }, + { + "epoch": 0.7253755303290907, + "grad_norm": 0.23776146399876796, + "learning_rate": 0.00018509213227158578, + "loss": 2.409, + "step": 6326 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 0.2692431768410158, + "learning_rate": 0.00018494791619507044, + "loss": 2.3954, + "step": 6327 + }, + { + "epoch": 0.7256048618277721, + "grad_norm": 0.23501032557711746, + "learning_rate": 0.0001848037435739846, + "loss": 2.466, + "step": 6328 + }, + { + "epoch": 0.7257195275771127, + "grad_norm": 0.23494667264566466, + "learning_rate": 0.00018465961442821422, + "loss": 2.3036, + "step": 6329 + }, + { + "epoch": 0.7258341933264534, + "grad_norm": 0.2525351815852477, + "learning_rate": 0.00018451552877763906, + "loss": 2.3913, + "step": 6330 + }, + { + "epoch": 0.7259488590757941, + "grad_norm": 0.2411813687212816, + "learning_rate": 0.00018437148664213315, + "loss": 2.2822, + "step": 6331 + }, + { + "epoch": 0.7260635248251347, + "grad_norm": 0.2596968333743399, + "learning_rate": 0.00018422748804156414, + "loss": 2.4222, + "step": 6332 + }, + { + "epoch": 0.7261781905744754, + "grad_norm": 0.2603268655437906, + "learning_rate": 0.00018408353299579377, + "loss": 2.513, + "step": 6333 + }, + { + "epoch": 0.7262928563238161, + "grad_norm": 0.263041376017191, + "learning_rate": 0.00018393962152467847, + "loss": 2.3774, + "step": 6334 + }, + { + "epoch": 0.7264075220731567, + "grad_norm": 0.2448367694477783, + "learning_rate": 0.0001837957536480675, + "loss": 2.35, + "step": 6335 + }, + { + "epoch": 0.7265221878224974, + "grad_norm": 0.24224252505582225, + "learning_rate": 0.00018365192938580495, + "loss": 2.3577, + "step": 6336 + }, + { + "epoch": 0.726636853571838, + "grad_norm": 0.25752866169736355, + "learning_rate": 0.00018350814875772874, + "loss": 2.4191, + "step": 6337 + }, + { + "epoch": 0.7267515193211788, + "grad_norm": 0.248726891606585, + "learning_rate": 0.00018336441178367025, + "loss": 2.4851, + "step": 6338 + }, + { + "epoch": 0.7268661850705195, + "grad_norm": 0.26251566921382435, + "learning_rate": 0.00018322071848345567, + "loss": 2.4883, + "step": 6339 + }, + { + "epoch": 0.7269808508198601, + "grad_norm": 0.24581560194528682, + "learning_rate": 0.00018307706887690477, + "loss": 2.1997, + "step": 6340 + }, + { + "epoch": 0.7270955165692008, + "grad_norm": 0.27848330607461524, + "learning_rate": 0.00018293346298383097, + "loss": 2.5403, + "step": 6341 + }, + { + "epoch": 0.7272101823185414, + "grad_norm": 0.2560841098437568, + "learning_rate": 0.00018278990082404206, + "loss": 2.4487, + "step": 6342 + }, + { + "epoch": 0.7273248480678821, + "grad_norm": 0.25605803918238557, + "learning_rate": 0.00018264638241733983, + "loss": 2.4992, + "step": 6343 + }, + { + "epoch": 0.7274395138172228, + "grad_norm": 0.25471666680789784, + "learning_rate": 0.0001825029077835198, + "loss": 2.4528, + "step": 6344 + }, + { + "epoch": 0.7275541795665634, + "grad_norm": 0.2702709415218117, + "learning_rate": 0.0001823594769423716, + "loss": 2.3383, + "step": 6345 + }, + { + "epoch": 0.7276688453159041, + "grad_norm": 0.26978626842915426, + "learning_rate": 0.0001822160899136789, + "loss": 2.4416, + "step": 6346 + }, + { + "epoch": 0.7277835110652449, + "grad_norm": 0.28201002769361216, + "learning_rate": 0.00018207274671721896, + "loss": 2.4747, + "step": 6347 + }, + { + "epoch": 0.7278981768145855, + "grad_norm": 0.25739206792869423, + "learning_rate": 0.00018192944737276335, + "loss": 2.3456, + "step": 6348 + }, + { + "epoch": 0.7280128425639262, + "grad_norm": 0.24048059538881492, + "learning_rate": 0.0001817861919000775, + "loss": 2.5114, + "step": 6349 + }, + { + "epoch": 0.7281275083132668, + "grad_norm": 0.24741594863916858, + "learning_rate": 0.0001816429803189208, + "loss": 2.3827, + "step": 6350 + }, + { + "epoch": 0.7282421740626075, + "grad_norm": 0.22575820325453494, + "learning_rate": 0.00018149981264904653, + "loss": 2.462, + "step": 6351 + }, + { + "epoch": 0.7283568398119482, + "grad_norm": 0.2599262102245639, + "learning_rate": 0.00018135668891020208, + "loss": 2.3489, + "step": 6352 + }, + { + "epoch": 0.7284715055612888, + "grad_norm": 0.2529561409075528, + "learning_rate": 0.00018121360912212814, + "loss": 2.5248, + "step": 6353 + }, + { + "epoch": 0.7285861713106295, + "grad_norm": 0.23192061333535038, + "learning_rate": 0.00018107057330456056, + "loss": 2.3605, + "step": 6354 + }, + { + "epoch": 0.7287008370599702, + "grad_norm": 0.22693905714388501, + "learning_rate": 0.00018092758147722794, + "loss": 2.3762, + "step": 6355 + }, + { + "epoch": 0.7288155028093108, + "grad_norm": 0.25769585498734093, + "learning_rate": 0.00018078463365985332, + "loss": 2.5354, + "step": 6356 + }, + { + "epoch": 0.7289301685586516, + "grad_norm": 0.22427009533700526, + "learning_rate": 0.00018064172987215365, + "loss": 2.4918, + "step": 6357 + }, + { + "epoch": 0.7290448343079922, + "grad_norm": 0.21141639241388008, + "learning_rate": 0.00018049887013383986, + "loss": 2.2762, + "step": 6358 + }, + { + "epoch": 0.7291595000573329, + "grad_norm": 0.25417417583245894, + "learning_rate": 0.00018035605446461671, + "loss": 2.4098, + "step": 6359 + }, + { + "epoch": 0.7292741658066736, + "grad_norm": 0.24630905597078248, + "learning_rate": 0.00018021328288418304, + "loss": 2.4107, + "step": 6360 + }, + { + "epoch": 0.7293888315560142, + "grad_norm": 0.2562281266007565, + "learning_rate": 0.0001800705554122311, + "loss": 2.3002, + "step": 6361 + }, + { + "epoch": 0.7295034973053549, + "grad_norm": 0.25053144427494145, + "learning_rate": 0.00017992787206844767, + "loss": 2.4666, + "step": 6362 + }, + { + "epoch": 0.7296181630546955, + "grad_norm": 0.2707093903145125, + "learning_rate": 0.00017978523287251314, + "loss": 2.3509, + "step": 6363 + }, + { + "epoch": 0.7297328288040362, + "grad_norm": 0.23546626627246298, + "learning_rate": 0.0001796426378441019, + "loss": 2.3344, + "step": 6364 + }, + { + "epoch": 0.7298474945533769, + "grad_norm": 0.26456970685911196, + "learning_rate": 0.00017950008700288222, + "loss": 2.4178, + "step": 6365 + }, + { + "epoch": 0.7299621603027175, + "grad_norm": 0.2700147073907764, + "learning_rate": 0.0001793575803685164, + "loss": 2.4863, + "step": 6366 + }, + { + "epoch": 0.7300768260520583, + "grad_norm": 0.23060516066151862, + "learning_rate": 0.0001792151179606602, + "loss": 2.2841, + "step": 6367 + }, + { + "epoch": 0.730191491801399, + "grad_norm": 0.27001929445015477, + "learning_rate": 0.00017907269979896358, + "loss": 2.4686, + "step": 6368 + }, + { + "epoch": 0.7303061575507396, + "grad_norm": 0.2738402082052355, + "learning_rate": 0.00017893032590307097, + "loss": 2.4409, + "step": 6369 + }, + { + "epoch": 0.7304208233000803, + "grad_norm": 0.2633449863703587, + "learning_rate": 0.00017878799629261955, + "loss": 2.4848, + "step": 6370 + }, + { + "epoch": 0.7305354890494209, + "grad_norm": 0.2530087326873093, + "learning_rate": 0.0001786457109872412, + "loss": 2.3913, + "step": 6371 + }, + { + "epoch": 0.7306501547987616, + "grad_norm": 0.23594224576003306, + "learning_rate": 0.00017850347000656158, + "loss": 2.3928, + "step": 6372 + }, + { + "epoch": 0.7307648205481023, + "grad_norm": 0.2504171887142438, + "learning_rate": 0.0001783612733701997, + "loss": 2.2967, + "step": 6373 + }, + { + "epoch": 0.7308794862974429, + "grad_norm": 0.28876013344220103, + "learning_rate": 0.00017821912109776929, + "loss": 2.4997, + "step": 6374 + }, + { + "epoch": 0.7309941520467836, + "grad_norm": 0.2566375347483386, + "learning_rate": 0.00017807701320887766, + "loss": 2.3467, + "step": 6375 + }, + { + "epoch": 0.7311088177961242, + "grad_norm": 0.2559084099750586, + "learning_rate": 0.0001779349497231254, + "loss": 2.3992, + "step": 6376 + }, + { + "epoch": 0.731223483545465, + "grad_norm": 0.24455758879156603, + "learning_rate": 0.00017779293066010776, + "loss": 2.58, + "step": 6377 + }, + { + "epoch": 0.7313381492948057, + "grad_norm": 0.32298626457033325, + "learning_rate": 0.0001776509560394135, + "loss": 2.4417, + "step": 6378 + }, + { + "epoch": 0.7314528150441463, + "grad_norm": 0.2705516887674042, + "learning_rate": 0.00017750902588062534, + "loss": 2.2464, + "step": 6379 + }, + { + "epoch": 0.731567480793487, + "grad_norm": 0.24950170430154092, + "learning_rate": 0.0001773671402033198, + "loss": 2.2936, + "step": 6380 + }, + { + "epoch": 0.7316821465428277, + "grad_norm": 0.2616864632231311, + "learning_rate": 0.00017722529902706747, + "loss": 2.3761, + "step": 6381 + }, + { + "epoch": 0.7317968122921683, + "grad_norm": 0.24255816239243266, + "learning_rate": 0.00017708350237143235, + "loss": 2.5065, + "step": 6382 + }, + { + "epoch": 0.731911478041509, + "grad_norm": 0.26166732735161835, + "learning_rate": 0.0001769417502559726, + "loss": 2.4196, + "step": 6383 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.27835620739194583, + "learning_rate": 0.00017680004270024063, + "loss": 2.4478, + "step": 6384 + }, + { + "epoch": 0.7321408095401903, + "grad_norm": 0.2742563062833354, + "learning_rate": 0.00017665837972378185, + "loss": 2.4594, + "step": 6385 + }, + { + "epoch": 0.732255475289531, + "grad_norm": 0.25214509190746215, + "learning_rate": 0.0001765167613461362, + "loss": 2.327, + "step": 6386 + }, + { + "epoch": 0.7323701410388717, + "grad_norm": 0.23972928643385047, + "learning_rate": 0.0001763751875868373, + "loss": 2.5061, + "step": 6387 + }, + { + "epoch": 0.7324848067882124, + "grad_norm": 0.2405970862554129, + "learning_rate": 0.00017623365846541206, + "loss": 2.4211, + "step": 6388 + }, + { + "epoch": 0.7325994725375531, + "grad_norm": 0.26278832972145627, + "learning_rate": 0.00017609217400138227, + "loss": 2.4494, + "step": 6389 + }, + { + "epoch": 0.7327141382868937, + "grad_norm": 0.29854968307617197, + "learning_rate": 0.000175950734214263, + "loss": 2.4794, + "step": 6390 + }, + { + "epoch": 0.7328288040362344, + "grad_norm": 0.22982503979857025, + "learning_rate": 0.00017580933912356283, + "loss": 2.4716, + "step": 6391 + }, + { + "epoch": 0.732943469785575, + "grad_norm": 0.2711480861666599, + "learning_rate": 0.0001756679887487847, + "loss": 2.4777, + "step": 6392 + }, + { + "epoch": 0.7330581355349157, + "grad_norm": 0.2693472933179573, + "learning_rate": 0.0001755266831094252, + "loss": 2.4635, + "step": 6393 + }, + { + "epoch": 0.7331728012842564, + "grad_norm": 0.28335585644181865, + "learning_rate": 0.00017538542222497478, + "loss": 2.4173, + "step": 6394 + }, + { + "epoch": 0.733287467033597, + "grad_norm": 0.2567275570973392, + "learning_rate": 0.0001752442061149176, + "loss": 2.39, + "step": 6395 + }, + { + "epoch": 0.7334021327829378, + "grad_norm": 0.25372657583174985, + "learning_rate": 0.0001751030347987319, + "loss": 2.453, + "step": 6396 + }, + { + "epoch": 0.7335167985322784, + "grad_norm": 0.25051814611341433, + "learning_rate": 0.00017496190829588938, + "loss": 2.5713, + "step": 6397 + }, + { + "epoch": 0.7336314642816191, + "grad_norm": 0.274209797280249, + "learning_rate": 0.00017482082662585575, + "loss": 2.3274, + "step": 6398 + }, + { + "epoch": 0.7337461300309598, + "grad_norm": 0.23327299851833, + "learning_rate": 0.00017467978980809063, + "loss": 2.462, + "step": 6399 + }, + { + "epoch": 0.7338607957803004, + "grad_norm": 0.2695485987745949, + "learning_rate": 0.00017453879786204736, + "loss": 2.4806, + "step": 6400 + }, + { + "epoch": 0.7339754615296411, + "grad_norm": 0.25397866037272276, + "learning_rate": 0.00017439785080717302, + "loss": 2.4387, + "step": 6401 + }, + { + "epoch": 0.7340901272789818, + "grad_norm": 0.23029430975258247, + "learning_rate": 0.00017425694866290886, + "loss": 2.3079, + "step": 6402 + }, + { + "epoch": 0.7342047930283224, + "grad_norm": 0.2618202206264615, + "learning_rate": 0.00017411609144868902, + "loss": 2.2745, + "step": 6403 + }, + { + "epoch": 0.7343194587776631, + "grad_norm": 0.257225924005901, + "learning_rate": 0.0001739752791839428, + "loss": 2.556, + "step": 6404 + }, + { + "epoch": 0.7344341245270037, + "grad_norm": 0.27276206623449895, + "learning_rate": 0.00017383451188809202, + "loss": 2.4432, + "step": 6405 + }, + { + "epoch": 0.7345487902763445, + "grad_norm": 0.2737758710908553, + "learning_rate": 0.00017369378958055309, + "loss": 2.3957, + "step": 6406 + }, + { + "epoch": 0.7346634560256852, + "grad_norm": 0.2612370822339997, + "learning_rate": 0.00017355311228073588, + "loss": 2.3741, + "step": 6407 + }, + { + "epoch": 0.7347781217750258, + "grad_norm": 0.27460841137430797, + "learning_rate": 0.0001734124800080442, + "loss": 2.4622, + "step": 6408 + }, + { + "epoch": 0.7348927875243665, + "grad_norm": 0.23819597260755612, + "learning_rate": 0.0001732718927818756, + "loss": 2.3509, + "step": 6409 + }, + { + "epoch": 0.7350074532737071, + "grad_norm": 0.26902026136348944, + "learning_rate": 0.00017313135062162156, + "loss": 2.332, + "step": 6410 + }, + { + "epoch": 0.7351221190230478, + "grad_norm": 0.25570815986070616, + "learning_rate": 0.00017299085354666684, + "loss": 2.4685, + "step": 6411 + }, + { + "epoch": 0.7352367847723885, + "grad_norm": 0.2483241272040528, + "learning_rate": 0.00017285040157639055, + "loss": 2.2687, + "step": 6412 + }, + { + "epoch": 0.7353514505217291, + "grad_norm": 0.2503368519024546, + "learning_rate": 0.00017270999473016524, + "loss": 2.4077, + "step": 6413 + }, + { + "epoch": 0.7354661162710698, + "grad_norm": 0.33417339436644733, + "learning_rate": 0.0001725696330273575, + "loss": 2.3328, + "step": 6414 + }, + { + "epoch": 0.7355807820204106, + "grad_norm": 0.2684638318458924, + "learning_rate": 0.00017242931648732746, + "loss": 2.3738, + "step": 6415 + }, + { + "epoch": 0.7356954477697512, + "grad_norm": 0.25751092889785415, + "learning_rate": 0.00017228904512942933, + "loss": 2.4867, + "step": 6416 + }, + { + "epoch": 0.7358101135190919, + "grad_norm": 0.24769457030090525, + "learning_rate": 0.00017214881897301053, + "loss": 2.3744, + "step": 6417 + }, + { + "epoch": 0.7359247792684325, + "grad_norm": 0.273370489174241, + "learning_rate": 0.00017200863803741256, + "loss": 2.3376, + "step": 6418 + }, + { + "epoch": 0.7360394450177732, + "grad_norm": 0.24001829120908535, + "learning_rate": 0.0001718685023419712, + "loss": 2.3963, + "step": 6419 + }, + { + "epoch": 0.7361541107671139, + "grad_norm": 0.2735408441436579, + "learning_rate": 0.00017172841190601508, + "loss": 2.3947, + "step": 6420 + }, + { + "epoch": 0.7362687765164545, + "grad_norm": 0.25038010240051484, + "learning_rate": 0.00017158836674886709, + "loss": 2.512, + "step": 6421 + }, + { + "epoch": 0.7363834422657952, + "grad_norm": 0.2511986466388504, + "learning_rate": 0.00017144836688984393, + "loss": 2.3736, + "step": 6422 + }, + { + "epoch": 0.7364981080151359, + "grad_norm": 0.261641704377779, + "learning_rate": 0.0001713084123482555, + "loss": 2.3823, + "step": 6423 + }, + { + "epoch": 0.7366127737644765, + "grad_norm": 0.24774891335312474, + "learning_rate": 0.0001711685031434063, + "loss": 2.3787, + "step": 6424 + }, + { + "epoch": 0.7367274395138173, + "grad_norm": 0.2817974334501849, + "learning_rate": 0.00017102863929459422, + "loss": 2.3888, + "step": 6425 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.25556045841548114, + "learning_rate": 0.00017088882082111033, + "loss": 2.3557, + "step": 6426 + }, + { + "epoch": 0.7369567710124986, + "grad_norm": 0.25429917219156073, + "learning_rate": 0.00017074904774224027, + "loss": 2.3451, + "step": 6427 + }, + { + "epoch": 0.7370714367618393, + "grad_norm": 0.24949524694374095, + "learning_rate": 0.00017060932007726293, + "loss": 2.4022, + "step": 6428 + }, + { + "epoch": 0.7371861025111799, + "grad_norm": 0.26945785783098614, + "learning_rate": 0.0001704696378454511, + "loss": 2.5371, + "step": 6429 + }, + { + "epoch": 0.7373007682605206, + "grad_norm": 0.24085239603047626, + "learning_rate": 0.00017033000106607133, + "loss": 2.4066, + "step": 6430 + }, + { + "epoch": 0.7374154340098612, + "grad_norm": 0.26438975356507105, + "learning_rate": 0.000170190409758384, + "loss": 2.4455, + "step": 6431 + }, + { + "epoch": 0.7375300997592019, + "grad_norm": 0.23933484319095555, + "learning_rate": 0.00017005086394164277, + "loss": 2.4235, + "step": 6432 + }, + { + "epoch": 0.7376447655085426, + "grad_norm": 0.25455056990007663, + "learning_rate": 0.00016991136363509523, + "loss": 2.3903, + "step": 6433 + }, + { + "epoch": 0.7377594312578832, + "grad_norm": 0.2879926758018694, + "learning_rate": 0.00016977190885798338, + "loss": 2.4464, + "step": 6434 + }, + { + "epoch": 0.737874097007224, + "grad_norm": 0.2526712579522657, + "learning_rate": 0.00016963249962954186, + "loss": 2.2796, + "step": 6435 + }, + { + "epoch": 0.7379887627565647, + "grad_norm": 0.2775822329112146, + "learning_rate": 0.00016949313596899957, + "loss": 2.5296, + "step": 6436 + }, + { + "epoch": 0.7381034285059053, + "grad_norm": 0.2926359744267795, + "learning_rate": 0.00016935381789557935, + "loss": 2.379, + "step": 6437 + }, + { + "epoch": 0.738218094255246, + "grad_norm": 0.25009791022147054, + "learning_rate": 0.0001692145454284969, + "loss": 2.4433, + "step": 6438 + }, + { + "epoch": 0.7383327600045866, + "grad_norm": 0.27482010013691033, + "learning_rate": 0.00016907531858696272, + "loss": 2.4141, + "step": 6439 + }, + { + "epoch": 0.7384474257539273, + "grad_norm": 0.24086837784438758, + "learning_rate": 0.00016893613739018054, + "loss": 2.3119, + "step": 6440 + }, + { + "epoch": 0.738562091503268, + "grad_norm": 0.2698293221390053, + "learning_rate": 0.00016879700185734736, + "loss": 2.3283, + "step": 6441 + }, + { + "epoch": 0.7386767572526086, + "grad_norm": 0.27000175315465824, + "learning_rate": 0.00016865791200765452, + "loss": 2.4391, + "step": 6442 + }, + { + "epoch": 0.7387914230019493, + "grad_norm": 0.24856935527968704, + "learning_rate": 0.0001685188678602867, + "loss": 2.5043, + "step": 6443 + }, + { + "epoch": 0.7389060887512899, + "grad_norm": 0.25099600324808935, + "learning_rate": 0.0001683798694344225, + "loss": 2.3761, + "step": 6444 + }, + { + "epoch": 0.7390207545006306, + "grad_norm": 0.27073262329145825, + "learning_rate": 0.00016824091674923404, + "loss": 2.5456, + "step": 6445 + }, + { + "epoch": 0.7391354202499714, + "grad_norm": 0.2700194485082787, + "learning_rate": 0.00016810200982388736, + "loss": 2.3781, + "step": 6446 + }, + { + "epoch": 0.739250085999312, + "grad_norm": 0.24865415740635244, + "learning_rate": 0.00016796314867754175, + "loss": 2.3994, + "step": 6447 + }, + { + "epoch": 0.7393647517486527, + "grad_norm": 0.2617456392522337, + "learning_rate": 0.00016782433332935054, + "loss": 2.4393, + "step": 6448 + }, + { + "epoch": 0.7394794174979934, + "grad_norm": 0.2516537119649872, + "learning_rate": 0.00016768556379846074, + "loss": 2.2734, + "step": 6449 + }, + { + "epoch": 0.739594083247334, + "grad_norm": 0.24740584302427626, + "learning_rate": 0.00016754684010401288, + "loss": 2.2605, + "step": 6450 + }, + { + "epoch": 0.7397087489966747, + "grad_norm": 0.3000928272834957, + "learning_rate": 0.00016740816226514134, + "loss": 2.4193, + "step": 6451 + }, + { + "epoch": 0.7398234147460153, + "grad_norm": 0.2689530172082053, + "learning_rate": 0.0001672695303009742, + "loss": 2.3269, + "step": 6452 + }, + { + "epoch": 0.739938080495356, + "grad_norm": 0.25359276963384686, + "learning_rate": 0.00016713094423063257, + "loss": 2.445, + "step": 6453 + }, + { + "epoch": 0.7400527462446967, + "grad_norm": 0.2655351113095584, + "learning_rate": 0.00016699240407323245, + "loss": 2.2968, + "step": 6454 + }, + { + "epoch": 0.7401674119940373, + "grad_norm": 0.26143423498694873, + "learning_rate": 0.00016685390984788235, + "loss": 2.3892, + "step": 6455 + }, + { + "epoch": 0.7402820777433781, + "grad_norm": 0.2685860924714883, + "learning_rate": 0.00016671546157368507, + "loss": 2.3429, + "step": 6456 + }, + { + "epoch": 0.7403967434927188, + "grad_norm": 0.2695601604068389, + "learning_rate": 0.00016657705926973682, + "loss": 2.2832, + "step": 6457 + }, + { + "epoch": 0.7405114092420594, + "grad_norm": 0.29279549829964374, + "learning_rate": 0.00016643870295512763, + "loss": 2.474, + "step": 6458 + }, + { + "epoch": 0.7406260749914001, + "grad_norm": 0.25695420948760556, + "learning_rate": 0.00016630039264894114, + "loss": 2.5383, + "step": 6459 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.2826840319743698, + "learning_rate": 0.00016616212837025475, + "loss": 2.57, + "step": 6460 + }, + { + "epoch": 0.7408554064900814, + "grad_norm": 0.28331646164693103, + "learning_rate": 0.00016602391013813907, + "loss": 2.449, + "step": 6461 + }, + { + "epoch": 0.7409700722394221, + "grad_norm": 0.2880777652658807, + "learning_rate": 0.0001658857379716588, + "loss": 2.4556, + "step": 6462 + }, + { + "epoch": 0.7410847379887627, + "grad_norm": 0.2779212008973584, + "learning_rate": 0.0001657476118898722, + "loss": 2.3941, + "step": 6463 + }, + { + "epoch": 0.7411994037381034, + "grad_norm": 0.24365589846876948, + "learning_rate": 0.0001656095319118311, + "loss": 2.4424, + "step": 6464 + }, + { + "epoch": 0.741314069487444, + "grad_norm": 0.2562907513914674, + "learning_rate": 0.00016547149805658102, + "loss": 2.4218, + "step": 6465 + }, + { + "epoch": 0.7414287352367848, + "grad_norm": 0.22003670505580827, + "learning_rate": 0.00016533351034316124, + "loss": 2.5837, + "step": 6466 + }, + { + "epoch": 0.7415434009861255, + "grad_norm": 0.23707831648615418, + "learning_rate": 0.00016519556879060422, + "loss": 2.3795, + "step": 6467 + }, + { + "epoch": 0.7416580667354661, + "grad_norm": 0.2265355270750885, + "learning_rate": 0.00016505767341793638, + "loss": 2.42, + "step": 6468 + }, + { + "epoch": 0.7417727324848068, + "grad_norm": 0.25543895117182963, + "learning_rate": 0.00016491982424417822, + "loss": 2.4143, + "step": 6469 + }, + { + "epoch": 0.7418873982341475, + "grad_norm": 0.24562601409371795, + "learning_rate": 0.00016478202128834298, + "loss": 2.5538, + "step": 6470 + }, + { + "epoch": 0.7420020639834881, + "grad_norm": 0.24634630304277433, + "learning_rate": 0.00016464426456943798, + "loss": 2.4678, + "step": 6471 + }, + { + "epoch": 0.7421167297328288, + "grad_norm": 0.2745901075912691, + "learning_rate": 0.0001645065541064645, + "loss": 2.3996, + "step": 6472 + }, + { + "epoch": 0.7422313954821694, + "grad_norm": 0.24452415107738634, + "learning_rate": 0.00016436888991841635, + "loss": 2.4128, + "step": 6473 + }, + { + "epoch": 0.7423460612315101, + "grad_norm": 0.26966349356425356, + "learning_rate": 0.00016423127202428234, + "loss": 2.3854, + "step": 6474 + }, + { + "epoch": 0.7424607269808509, + "grad_norm": 0.23410776465117272, + "learning_rate": 0.00016409370044304417, + "loss": 2.2958, + "step": 6475 + }, + { + "epoch": 0.7425753927301915, + "grad_norm": 0.23977639100899975, + "learning_rate": 0.00016395617519367694, + "loss": 2.3592, + "step": 6476 + }, + { + "epoch": 0.7426900584795322, + "grad_norm": 0.24684302742019762, + "learning_rate": 0.00016381869629514972, + "loss": 2.3887, + "step": 6477 + }, + { + "epoch": 0.7428047242288728, + "grad_norm": 0.2760276302081096, + "learning_rate": 0.00016368126376642517, + "loss": 2.3981, + "step": 6478 + }, + { + "epoch": 0.7429193899782135, + "grad_norm": 0.24430085885689223, + "learning_rate": 0.0001635438776264595, + "loss": 2.4263, + "step": 6479 + }, + { + "epoch": 0.7430340557275542, + "grad_norm": 0.24341053562650392, + "learning_rate": 0.00016340653789420246, + "loss": 2.4263, + "step": 6480 + }, + { + "epoch": 0.7431487214768948, + "grad_norm": 0.2954612588048405, + "learning_rate": 0.0001632692445885976, + "loss": 2.5716, + "step": 6481 + }, + { + "epoch": 0.7432633872262355, + "grad_norm": 0.2795041690457652, + "learning_rate": 0.00016313199772858162, + "loss": 2.421, + "step": 6482 + }, + { + "epoch": 0.7433780529755762, + "grad_norm": 0.24879527765820658, + "learning_rate": 0.0001629947973330851, + "loss": 2.2663, + "step": 6483 + }, + { + "epoch": 0.7434927187249168, + "grad_norm": 0.24786038043681655, + "learning_rate": 0.0001628576434210327, + "loss": 2.3732, + "step": 6484 + }, + { + "epoch": 0.7436073844742576, + "grad_norm": 0.2578956512610779, + "learning_rate": 0.0001627205360113417, + "loss": 2.4985, + "step": 6485 + }, + { + "epoch": 0.7437220502235982, + "grad_norm": 0.2547879296125543, + "learning_rate": 0.00016258347512292364, + "loss": 2.4464, + "step": 6486 + }, + { + "epoch": 0.7438367159729389, + "grad_norm": 0.2426821857356462, + "learning_rate": 0.0001624464607746836, + "loss": 2.4569, + "step": 6487 + }, + { + "epoch": 0.7439513817222796, + "grad_norm": 0.2293939025740168, + "learning_rate": 0.00016230949298551967, + "loss": 2.3713, + "step": 6488 + }, + { + "epoch": 0.7440660474716202, + "grad_norm": 0.2597311712424021, + "learning_rate": 0.00016217257177432432, + "loss": 2.3969, + "step": 6489 + }, + { + "epoch": 0.7441807132209609, + "grad_norm": 0.22775542067646762, + "learning_rate": 0.0001620356971599833, + "loss": 2.4824, + "step": 6490 + }, + { + "epoch": 0.7442953789703016, + "grad_norm": 0.24618118113992368, + "learning_rate": 0.00016189886916137547, + "loss": 2.4064, + "step": 6491 + }, + { + "epoch": 0.7444100447196422, + "grad_norm": 0.26715289128833575, + "learning_rate": 0.0001617620877973739, + "loss": 2.3661, + "step": 6492 + }, + { + "epoch": 0.7445247104689829, + "grad_norm": 0.24838679607508837, + "learning_rate": 0.0001616253530868449, + "loss": 2.4047, + "step": 6493 + }, + { + "epoch": 0.7446393762183235, + "grad_norm": 0.2665402729934092, + "learning_rate": 0.0001614886650486484, + "loss": 2.3521, + "step": 6494 + }, + { + "epoch": 0.7447540419676643, + "grad_norm": 0.21901497163164738, + "learning_rate": 0.00016135202370163798, + "loss": 2.2581, + "step": 6495 + }, + { + "epoch": 0.744868707717005, + "grad_norm": 0.27963002865866904, + "learning_rate": 0.00016121542906466087, + "loss": 2.4586, + "step": 6496 + }, + { + "epoch": 0.7449833734663456, + "grad_norm": 0.2272167510121665, + "learning_rate": 0.00016107888115655727, + "loss": 2.4346, + "step": 6497 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 0.23221969349128596, + "learning_rate": 0.00016094237999616162, + "loss": 2.3672, + "step": 6498 + }, + { + "epoch": 0.7452127049650269, + "grad_norm": 0.2572882361327514, + "learning_rate": 0.00016080592560230162, + "loss": 2.5071, + "step": 6499 + }, + { + "epoch": 0.7453273707143676, + "grad_norm": 0.23563660975172557, + "learning_rate": 0.00016066951799379852, + "loss": 2.364, + "step": 6500 + }, + { + "epoch": 0.7454420364637083, + "grad_norm": 0.257424602609495, + "learning_rate": 0.00016053315718946726, + "loss": 2.5173, + "step": 6501 + }, + { + "epoch": 0.7455567022130489, + "grad_norm": 0.25988742865269904, + "learning_rate": 0.00016039684320811627, + "loss": 2.3357, + "step": 6502 + }, + { + "epoch": 0.7456713679623896, + "grad_norm": 0.259338911434718, + "learning_rate": 0.000160260576068547, + "loss": 2.3429, + "step": 6503 + }, + { + "epoch": 0.7457860337117304, + "grad_norm": 0.26662028476921323, + "learning_rate": 0.00016012435578955552, + "loss": 2.3863, + "step": 6504 + }, + { + "epoch": 0.745900699461071, + "grad_norm": 0.257441712900983, + "learning_rate": 0.00015998818238993046, + "loss": 2.4351, + "step": 6505 + }, + { + "epoch": 0.7460153652104117, + "grad_norm": 0.2397458781662433, + "learning_rate": 0.00015985205588845436, + "loss": 2.5672, + "step": 6506 + }, + { + "epoch": 0.7461300309597523, + "grad_norm": 0.2554299064534445, + "learning_rate": 0.0001597159763039034, + "loss": 2.3912, + "step": 6507 + }, + { + "epoch": 0.746244696709093, + "grad_norm": 0.23833238551514632, + "learning_rate": 0.00015957994365504709, + "loss": 2.3998, + "step": 6508 + }, + { + "epoch": 0.7463593624584337, + "grad_norm": 0.25926464849641523, + "learning_rate": 0.00015944395796064847, + "loss": 2.354, + "step": 6509 + }, + { + "epoch": 0.7464740282077743, + "grad_norm": 0.2588610537965368, + "learning_rate": 0.00015930801923946453, + "loss": 2.3658, + "step": 6510 + }, + { + "epoch": 0.746588693957115, + "grad_norm": 0.2478396020699848, + "learning_rate": 0.00015917212751024497, + "loss": 2.3444, + "step": 6511 + }, + { + "epoch": 0.7467033597064556, + "grad_norm": 0.24263915918450826, + "learning_rate": 0.0001590362827917336, + "loss": 2.43, + "step": 6512 + }, + { + "epoch": 0.7468180254557963, + "grad_norm": 0.25354821661470545, + "learning_rate": 0.00015890048510266768, + "loss": 2.2718, + "step": 6513 + }, + { + "epoch": 0.7469326912051371, + "grad_norm": 0.2867770908491982, + "learning_rate": 0.00015876473446177792, + "loss": 2.419, + "step": 6514 + }, + { + "epoch": 0.7470473569544777, + "grad_norm": 0.25085489041591963, + "learning_rate": 0.0001586290308877885, + "loss": 2.4816, + "step": 6515 + }, + { + "epoch": 0.7471620227038184, + "grad_norm": 0.24658458569034983, + "learning_rate": 0.00015849337439941731, + "loss": 2.3646, + "step": 6516 + }, + { + "epoch": 0.7472766884531591, + "grad_norm": 0.2562785709451517, + "learning_rate": 0.00015835776501537524, + "loss": 2.4274, + "step": 6517 + }, + { + "epoch": 0.7473913542024997, + "grad_norm": 0.25372733288916743, + "learning_rate": 0.000158222202754367, + "loss": 2.4289, + "step": 6518 + }, + { + "epoch": 0.7475060199518404, + "grad_norm": 0.24170121945546652, + "learning_rate": 0.00015808668763509138, + "loss": 2.3094, + "step": 6519 + }, + { + "epoch": 0.747620685701181, + "grad_norm": 0.23866326861704354, + "learning_rate": 0.00015795121967623955, + "loss": 2.4337, + "step": 6520 + }, + { + "epoch": 0.7477353514505217, + "grad_norm": 0.3157771506251674, + "learning_rate": 0.00015781579889649695, + "loss": 2.2822, + "step": 6521 + }, + { + "epoch": 0.7478500171998624, + "grad_norm": 0.275701823082198, + "learning_rate": 0.0001576804253145424, + "loss": 2.4905, + "step": 6522 + }, + { + "epoch": 0.747964682949203, + "grad_norm": 0.26507048253077253, + "learning_rate": 0.00015754509894904767, + "loss": 2.4144, + "step": 6523 + }, + { + "epoch": 0.7480793486985438, + "grad_norm": 0.2650881384820673, + "learning_rate": 0.0001574098198186789, + "loss": 2.4104, + "step": 6524 + }, + { + "epoch": 0.7481940144478845, + "grad_norm": 0.23345830152427435, + "learning_rate": 0.00015727458794209532, + "loss": 2.3961, + "step": 6525 + }, + { + "epoch": 0.7483086801972251, + "grad_norm": 0.2616393227989673, + "learning_rate": 0.00015713940333794924, + "loss": 2.3332, + "step": 6526 + }, + { + "epoch": 0.7484233459465658, + "grad_norm": 0.2636261029965682, + "learning_rate": 0.00015700426602488694, + "loss": 2.3359, + "step": 6527 + }, + { + "epoch": 0.7485380116959064, + "grad_norm": 0.2590906361630558, + "learning_rate": 0.000156869176021548, + "loss": 2.3446, + "step": 6528 + }, + { + "epoch": 0.7486526774452471, + "grad_norm": 0.24853030451381355, + "learning_rate": 0.00015673413334656562, + "loss": 2.2838, + "step": 6529 + }, + { + "epoch": 0.7487673431945878, + "grad_norm": 0.2674819230945004, + "learning_rate": 0.00015659913801856624, + "loss": 2.3066, + "step": 6530 + }, + { + "epoch": 0.7488820089439284, + "grad_norm": 0.251569792005686, + "learning_rate": 0.00015646419005617014, + "loss": 2.3501, + "step": 6531 + }, + { + "epoch": 0.7489966746932691, + "grad_norm": 0.257095050545602, + "learning_rate": 0.00015632928947799046, + "loss": 2.4589, + "step": 6532 + }, + { + "epoch": 0.7491113404426097, + "grad_norm": 0.2718355013828471, + "learning_rate": 0.00015619443630263426, + "loss": 2.4579, + "step": 6533 + }, + { + "epoch": 0.7492260061919505, + "grad_norm": 0.27022586286513006, + "learning_rate": 0.00015605963054870226, + "loss": 2.4169, + "step": 6534 + }, + { + "epoch": 0.7493406719412912, + "grad_norm": 0.2730818068698634, + "learning_rate": 0.00015592487223478802, + "loss": 2.2675, + "step": 6535 + }, + { + "epoch": 0.7494553376906318, + "grad_norm": 0.28820148512239824, + "learning_rate": 0.00015579016137947898, + "loss": 2.3329, + "step": 6536 + }, + { + "epoch": 0.7495700034399725, + "grad_norm": 0.2648563268377763, + "learning_rate": 0.0001556554980013561, + "loss": 2.3603, + "step": 6537 + }, + { + "epoch": 0.7496846691893132, + "grad_norm": 0.286807142340206, + "learning_rate": 0.00015552088211899311, + "loss": 2.4557, + "step": 6538 + }, + { + "epoch": 0.7497993349386538, + "grad_norm": 0.27078543998863563, + "learning_rate": 0.00015538631375095824, + "loss": 2.3435, + "step": 6539 + }, + { + "epoch": 0.7499140006879945, + "grad_norm": 0.2539300677677799, + "learning_rate": 0.00015525179291581265, + "loss": 2.446, + "step": 6540 + }, + { + "epoch": 0.7500286664373351, + "grad_norm": 0.2680512435727763, + "learning_rate": 0.00015511731963211057, + "loss": 2.3388, + "step": 6541 + }, + { + "epoch": 0.7501433321866758, + "grad_norm": 0.23276964650397447, + "learning_rate": 0.00015498289391840015, + "loss": 2.3831, + "step": 6542 + }, + { + "epoch": 0.7502579979360166, + "grad_norm": 0.25655775790486474, + "learning_rate": 0.0001548485157932229, + "loss": 2.3926, + "step": 6543 + }, + { + "epoch": 0.7503726636853572, + "grad_norm": 0.28221696635866594, + "learning_rate": 0.00015471418527511366, + "loss": 2.4787, + "step": 6544 + }, + { + "epoch": 0.7504873294346979, + "grad_norm": 0.23006601439987143, + "learning_rate": 0.0001545799023826009, + "loss": 2.3436, + "step": 6545 + }, + { + "epoch": 0.7506019951840385, + "grad_norm": 0.23829401301324962, + "learning_rate": 0.00015444566713420643, + "loss": 2.3431, + "step": 6546 + }, + { + "epoch": 0.7507166609333792, + "grad_norm": 0.2522764831833254, + "learning_rate": 0.00015431147954844516, + "loss": 2.35, + "step": 6547 + }, + { + "epoch": 0.7508313266827199, + "grad_norm": 0.25266432541030887, + "learning_rate": 0.00015417733964382587, + "loss": 2.441, + "step": 6548 + }, + { + "epoch": 0.7509459924320605, + "grad_norm": 0.24496874776163988, + "learning_rate": 0.00015404324743885055, + "loss": 2.4877, + "step": 6549 + }, + { + "epoch": 0.7510606581814012, + "grad_norm": 0.25821055523971004, + "learning_rate": 0.0001539092029520147, + "loss": 2.4441, + "step": 6550 + }, + { + "epoch": 0.7511753239307419, + "grad_norm": 0.2446546467967584, + "learning_rate": 0.0001537752062018073, + "loss": 2.2808, + "step": 6551 + }, + { + "epoch": 0.7512899896800825, + "grad_norm": 0.23416276014559373, + "learning_rate": 0.00015364125720671062, + "loss": 2.331, + "step": 6552 + }, + { + "epoch": 0.7514046554294233, + "grad_norm": 0.22276805268508754, + "learning_rate": 0.00015350735598520005, + "loss": 2.2845, + "step": 6553 + }, + { + "epoch": 0.7515193211787639, + "grad_norm": 0.23970777248771488, + "learning_rate": 0.00015337350255574528, + "loss": 2.4154, + "step": 6554 + }, + { + "epoch": 0.7516339869281046, + "grad_norm": 0.24398053535117697, + "learning_rate": 0.0001532396969368084, + "loss": 2.4158, + "step": 6555 + }, + { + "epoch": 0.7517486526774453, + "grad_norm": 0.24255876198168227, + "learning_rate": 0.00015310593914684545, + "loss": 2.4301, + "step": 6556 + }, + { + "epoch": 0.7518633184267859, + "grad_norm": 0.23422146158572033, + "learning_rate": 0.00015297222920430576, + "loss": 2.2782, + "step": 6557 + }, + { + "epoch": 0.7519779841761266, + "grad_norm": 0.2824356410531014, + "learning_rate": 0.00015283856712763216, + "loss": 2.465, + "step": 6558 + }, + { + "epoch": 0.7520926499254673, + "grad_norm": 0.2509184042150388, + "learning_rate": 0.0001527049529352607, + "loss": 2.5023, + "step": 6559 + }, + { + "epoch": 0.7522073156748079, + "grad_norm": 0.2541861593116082, + "learning_rate": 0.0001525713866456211, + "loss": 2.5068, + "step": 6560 + }, + { + "epoch": 0.7523219814241486, + "grad_norm": 0.2600394679701279, + "learning_rate": 0.0001524378682771359, + "loss": 2.4542, + "step": 6561 + }, + { + "epoch": 0.7524366471734892, + "grad_norm": 0.26762343055476856, + "learning_rate": 0.0001523043978482217, + "loss": 2.5312, + "step": 6562 + }, + { + "epoch": 0.75255131292283, + "grad_norm": 0.2631801285456526, + "learning_rate": 0.00015217097537728813, + "loss": 2.3052, + "step": 6563 + }, + { + "epoch": 0.7526659786721707, + "grad_norm": 0.2522708164393584, + "learning_rate": 0.00015203760088273828, + "loss": 2.3605, + "step": 6564 + }, + { + "epoch": 0.7527806444215113, + "grad_norm": 0.26360983448076153, + "learning_rate": 0.00015190427438296862, + "loss": 2.349, + "step": 6565 + }, + { + "epoch": 0.752895310170852, + "grad_norm": 0.237102860279755, + "learning_rate": 0.00015177099589636916, + "loss": 2.1964, + "step": 6566 + }, + { + "epoch": 0.7530099759201926, + "grad_norm": 0.2554446661192114, + "learning_rate": 0.0001516377654413228, + "loss": 2.2934, + "step": 6567 + }, + { + "epoch": 0.7531246416695333, + "grad_norm": 0.24540631856412679, + "learning_rate": 0.00015150458303620618, + "loss": 2.3706, + "step": 6568 + }, + { + "epoch": 0.753239307418874, + "grad_norm": 0.27470871946578723, + "learning_rate": 0.00015137144869938968, + "loss": 2.4719, + "step": 6569 + }, + { + "epoch": 0.7533539731682146, + "grad_norm": 0.2636079681191475, + "learning_rate": 0.00015123836244923622, + "loss": 2.5388, + "step": 6570 + }, + { + "epoch": 0.7534686389175553, + "grad_norm": 0.2514471183025824, + "learning_rate": 0.00015110532430410268, + "loss": 2.5087, + "step": 6571 + }, + { + "epoch": 0.753583304666896, + "grad_norm": 0.24977845291221487, + "learning_rate": 0.00015097233428233925, + "loss": 2.3574, + "step": 6572 + }, + { + "epoch": 0.7536979704162367, + "grad_norm": 0.26476615615970445, + "learning_rate": 0.00015083939240228888, + "loss": 2.4463, + "step": 6573 + }, + { + "epoch": 0.7538126361655774, + "grad_norm": 0.25745675599193374, + "learning_rate": 0.00015070649868228898, + "loss": 2.4495, + "step": 6574 + }, + { + "epoch": 0.753927301914918, + "grad_norm": 0.2970827011285602, + "learning_rate": 0.00015057365314066952, + "loss": 2.4109, + "step": 6575 + }, + { + "epoch": 0.7540419676642587, + "grad_norm": 0.24908745403969004, + "learning_rate": 0.0001504408557957538, + "loss": 2.4128, + "step": 6576 + }, + { + "epoch": 0.7541566334135994, + "grad_norm": 0.2663490710353967, + "learning_rate": 0.0001503081066658588, + "loss": 2.4695, + "step": 6577 + }, + { + "epoch": 0.75427129916294, + "grad_norm": 0.25961752483260114, + "learning_rate": 0.00015017540576929477, + "loss": 2.4745, + "step": 6578 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 0.258382235975336, + "learning_rate": 0.00015004275312436528, + "loss": 2.4189, + "step": 6579 + }, + { + "epoch": 0.7545006306616213, + "grad_norm": 0.245218710372783, + "learning_rate": 0.00014991014874936721, + "loss": 2.3867, + "step": 6580 + }, + { + "epoch": 0.754615296410962, + "grad_norm": 0.2505824358915988, + "learning_rate": 0.00014977759266259093, + "loss": 2.4547, + "step": 6581 + }, + { + "epoch": 0.7547299621603027, + "grad_norm": 0.24960901032548094, + "learning_rate": 0.00014964508488231971, + "loss": 2.2998, + "step": 6582 + }, + { + "epoch": 0.7548446279096434, + "grad_norm": 0.26770038840401794, + "learning_rate": 0.00014951262542683052, + "loss": 2.3962, + "step": 6583 + }, + { + "epoch": 0.7549592936589841, + "grad_norm": 0.26525596711295824, + "learning_rate": 0.00014938021431439407, + "loss": 2.431, + "step": 6584 + }, + { + "epoch": 0.7550739594083248, + "grad_norm": 0.2200162860346518, + "learning_rate": 0.00014924785156327354, + "loss": 2.339, + "step": 6585 + }, + { + "epoch": 0.7551886251576654, + "grad_norm": 0.25439755147628335, + "learning_rate": 0.00014911553719172587, + "loss": 2.3767, + "step": 6586 + }, + { + "epoch": 0.7553032909070061, + "grad_norm": 0.27934193198072843, + "learning_rate": 0.00014898327121800153, + "loss": 2.5087, + "step": 6587 + }, + { + "epoch": 0.7554179566563467, + "grad_norm": 0.2627670808905417, + "learning_rate": 0.0001488510536603437, + "loss": 2.2752, + "step": 6588 + }, + { + "epoch": 0.7555326224056874, + "grad_norm": 0.27328945197109195, + "learning_rate": 0.00014871888453698957, + "loss": 2.3786, + "step": 6589 + }, + { + "epoch": 0.7556472881550281, + "grad_norm": 0.25542486950594584, + "learning_rate": 0.00014858676386616949, + "loss": 2.4662, + "step": 6590 + }, + { + "epoch": 0.7557619539043687, + "grad_norm": 0.29053477814777107, + "learning_rate": 0.00014845469166610664, + "loss": 2.4517, + "step": 6591 + }, + { + "epoch": 0.7558766196537094, + "grad_norm": 0.2401803572319788, + "learning_rate": 0.00014832266795501803, + "loss": 2.4132, + "step": 6592 + }, + { + "epoch": 0.7559912854030502, + "grad_norm": 0.2607204767078145, + "learning_rate": 0.00014819069275111375, + "loss": 2.4297, + "step": 6593 + }, + { + "epoch": 0.7561059511523908, + "grad_norm": 0.2465156351953693, + "learning_rate": 0.00014805876607259732, + "loss": 2.3357, + "step": 6594 + }, + { + "epoch": 0.7562206169017315, + "grad_norm": 0.26379271350811356, + "learning_rate": 0.0001479268879376655, + "loss": 2.5467, + "step": 6595 + }, + { + "epoch": 0.7563352826510721, + "grad_norm": 0.25282091845670956, + "learning_rate": 0.00014779505836450858, + "loss": 2.3736, + "step": 6596 + }, + { + "epoch": 0.7564499484004128, + "grad_norm": 0.2716544263188572, + "learning_rate": 0.0001476632773713095, + "loss": 2.418, + "step": 6597 + }, + { + "epoch": 0.7565646141497535, + "grad_norm": 0.2368122502261607, + "learning_rate": 0.00014753154497624516, + "loss": 2.5258, + "step": 6598 + }, + { + "epoch": 0.7566792798990941, + "grad_norm": 0.26398318338078525, + "learning_rate": 0.00014739986119748555, + "loss": 2.3533, + "step": 6599 + }, + { + "epoch": 0.7567939456484348, + "grad_norm": 0.27607560676181087, + "learning_rate": 0.00014726822605319395, + "loss": 2.3845, + "step": 6600 + }, + { + "epoch": 0.7569086113977754, + "grad_norm": 0.2582099129002738, + "learning_rate": 0.00014713663956152683, + "loss": 2.4197, + "step": 6601 + }, + { + "epoch": 0.7570232771471161, + "grad_norm": 0.2614857032104042, + "learning_rate": 0.00014700510174063425, + "loss": 2.3975, + "step": 6602 + }, + { + "epoch": 0.7571379428964569, + "grad_norm": 0.24998381044406973, + "learning_rate": 0.00014687361260865895, + "loss": 2.294, + "step": 6603 + }, + { + "epoch": 0.7572526086457975, + "grad_norm": 0.27955160821198743, + "learning_rate": 0.0001467421721837378, + "loss": 2.3737, + "step": 6604 + }, + { + "epoch": 0.7573672743951382, + "grad_norm": 0.27421178767257853, + "learning_rate": 0.00014661078048400022, + "loss": 2.3944, + "step": 6605 + }, + { + "epoch": 0.7574819401444789, + "grad_norm": 0.23959803315329295, + "learning_rate": 0.0001464794375275692, + "loss": 2.4179, + "step": 6606 + }, + { + "epoch": 0.7575966058938195, + "grad_norm": 0.23597878298447664, + "learning_rate": 0.00014634814333256107, + "loss": 2.259, + "step": 6607 + }, + { + "epoch": 0.7577112716431602, + "grad_norm": 0.284674838340585, + "learning_rate": 0.00014621689791708537, + "loss": 2.4184, + "step": 6608 + }, + { + "epoch": 0.7578259373925008, + "grad_norm": 0.26071227690661253, + "learning_rate": 0.00014608570129924492, + "loss": 2.3996, + "step": 6609 + }, + { + "epoch": 0.7579406031418415, + "grad_norm": 0.25387333916424853, + "learning_rate": 0.00014595455349713583, + "loss": 2.3992, + "step": 6610 + }, + { + "epoch": 0.7580552688911822, + "grad_norm": 0.24547623314803763, + "learning_rate": 0.00014582345452884726, + "loss": 2.3626, + "step": 6611 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.2540895935021022, + "learning_rate": 0.0001456924044124619, + "loss": 2.4054, + "step": 6612 + }, + { + "epoch": 0.7582846003898636, + "grad_norm": 0.2587215324575647, + "learning_rate": 0.00014556140316605566, + "loss": 2.3148, + "step": 6613 + }, + { + "epoch": 0.7583992661392042, + "grad_norm": 0.26255031206946017, + "learning_rate": 0.00014543045080769762, + "loss": 2.3216, + "step": 6614 + }, + { + "epoch": 0.7585139318885449, + "grad_norm": 0.22832438793093263, + "learning_rate": 0.00014529954735545016, + "loss": 2.3179, + "step": 6615 + }, + { + "epoch": 0.7586285976378856, + "grad_norm": 0.24235394915497896, + "learning_rate": 0.00014516869282736911, + "loss": 2.505, + "step": 6616 + }, + { + "epoch": 0.7587432633872262, + "grad_norm": 0.2317537446371527, + "learning_rate": 0.00014503788724150302, + "loss": 2.3544, + "step": 6617 + }, + { + "epoch": 0.7588579291365669, + "grad_norm": 0.25603494773416513, + "learning_rate": 0.00014490713061589405, + "loss": 2.4477, + "step": 6618 + }, + { + "epoch": 0.7589725948859076, + "grad_norm": 0.23909052454498408, + "learning_rate": 0.00014477642296857807, + "loss": 2.3316, + "step": 6619 + }, + { + "epoch": 0.7590872606352482, + "grad_norm": 0.2552529622855681, + "learning_rate": 0.00014464576431758315, + "loss": 2.3904, + "step": 6620 + }, + { + "epoch": 0.7592019263845889, + "grad_norm": 0.2548669530931078, + "learning_rate": 0.0001445151546809314, + "loss": 2.4366, + "step": 6621 + }, + { + "epoch": 0.7593165921339295, + "grad_norm": 0.2567057188057503, + "learning_rate": 0.00014438459407663807, + "loss": 2.3519, + "step": 6622 + }, + { + "epoch": 0.7594312578832703, + "grad_norm": 0.2366407440583382, + "learning_rate": 0.000144254082522711, + "loss": 2.3935, + "step": 6623 + }, + { + "epoch": 0.759545923632611, + "grad_norm": 0.27813547277917605, + "learning_rate": 0.00014412362003715228, + "loss": 2.391, + "step": 6624 + }, + { + "epoch": 0.7596605893819516, + "grad_norm": 0.24437914698457763, + "learning_rate": 0.00014399320663795672, + "loss": 2.3812, + "step": 6625 + }, + { + "epoch": 0.7597752551312923, + "grad_norm": 0.23773796960309754, + "learning_rate": 0.00014386284234311204, + "loss": 2.4151, + "step": 6626 + }, + { + "epoch": 0.759889920880633, + "grad_norm": 0.28067293554375405, + "learning_rate": 0.00014373252717059964, + "loss": 2.4878, + "step": 6627 + }, + { + "epoch": 0.7600045866299736, + "grad_norm": 0.24667583741589158, + "learning_rate": 0.00014360226113839408, + "loss": 2.5272, + "step": 6628 + }, + { + "epoch": 0.7601192523793143, + "grad_norm": 0.21924024300205358, + "learning_rate": 0.00014347204426446304, + "loss": 2.3447, + "step": 6629 + }, + { + "epoch": 0.7602339181286549, + "grad_norm": 0.26552192308108785, + "learning_rate": 0.0001433418765667674, + "loss": 2.3299, + "step": 6630 + }, + { + "epoch": 0.7603485838779956, + "grad_norm": 0.2544526239602189, + "learning_rate": 0.0001432117580632616, + "loss": 2.3384, + "step": 6631 + }, + { + "epoch": 0.7604632496273364, + "grad_norm": 0.241834312867679, + "learning_rate": 0.00014308168877189253, + "loss": 2.3787, + "step": 6632 + }, + { + "epoch": 0.760577915376677, + "grad_norm": 0.26889820285213384, + "learning_rate": 0.00014295166871060088, + "loss": 2.4072, + "step": 6633 + }, + { + "epoch": 0.7606925811260177, + "grad_norm": 0.24229494223821071, + "learning_rate": 0.00014282169789732091, + "loss": 2.3826, + "step": 6634 + }, + { + "epoch": 0.7608072468753583, + "grad_norm": 0.28425367269356444, + "learning_rate": 0.00014269177634997914, + "loss": 2.2299, + "step": 6635 + }, + { + "epoch": 0.760921912624699, + "grad_norm": 0.234340480475067, + "learning_rate": 0.00014256190408649589, + "loss": 2.3652, + "step": 6636 + }, + { + "epoch": 0.7610365783740397, + "grad_norm": 0.2264685852149525, + "learning_rate": 0.00014243208112478474, + "loss": 2.4637, + "step": 6637 + }, + { + "epoch": 0.7611512441233803, + "grad_norm": 0.262455095360687, + "learning_rate": 0.0001423023074827518, + "loss": 2.3766, + "step": 6638 + }, + { + "epoch": 0.761265909872721, + "grad_norm": 0.24091388722021084, + "learning_rate": 0.00014217258317829735, + "loss": 2.3629, + "step": 6639 + }, + { + "epoch": 0.7613805756220617, + "grad_norm": 0.2649176508161098, + "learning_rate": 0.00014204290822931442, + "loss": 2.4058, + "step": 6640 + }, + { + "epoch": 0.7614952413714023, + "grad_norm": 0.2628773440882439, + "learning_rate": 0.00014191328265368886, + "loss": 2.3025, + "step": 6641 + }, + { + "epoch": 0.7616099071207431, + "grad_norm": 0.2433668136557576, + "learning_rate": 0.00014178370646930017, + "loss": 2.3005, + "step": 6642 + }, + { + "epoch": 0.7617245728700837, + "grad_norm": 0.268670919624635, + "learning_rate": 0.000141654179694021, + "loss": 2.4849, + "step": 6643 + }, + { + "epoch": 0.7618392386194244, + "grad_norm": 0.3109078186889851, + "learning_rate": 0.000141524702345717, + "loss": 2.3522, + "step": 6644 + }, + { + "epoch": 0.7619539043687651, + "grad_norm": 0.24743456555178936, + "learning_rate": 0.0001413952744422472, + "loss": 2.4379, + "step": 6645 + }, + { + "epoch": 0.7620685701181057, + "grad_norm": 0.24247268933711663, + "learning_rate": 0.0001412658960014638, + "loss": 2.3638, + "step": 6646 + }, + { + "epoch": 0.7621832358674464, + "grad_norm": 0.27971227884244154, + "learning_rate": 0.0001411365670412118, + "loss": 2.2063, + "step": 6647 + }, + { + "epoch": 0.762297901616787, + "grad_norm": 0.24328706614973591, + "learning_rate": 0.0001410072875793298, + "loss": 2.4731, + "step": 6648 + }, + { + "epoch": 0.7624125673661277, + "grad_norm": 0.26000443963884157, + "learning_rate": 0.00014087805763364947, + "loss": 2.2662, + "step": 6649 + }, + { + "epoch": 0.7625272331154684, + "grad_norm": 0.2562946972523383, + "learning_rate": 0.0001407488772219956, + "loss": 2.3624, + "step": 6650 + }, + { + "epoch": 0.762641898864809, + "grad_norm": 0.2902918259582824, + "learning_rate": 0.00014061974636218617, + "loss": 2.3978, + "step": 6651 + }, + { + "epoch": 0.7627565646141498, + "grad_norm": 0.2785332006422166, + "learning_rate": 0.00014049066507203244, + "loss": 2.3816, + "step": 6652 + }, + { + "epoch": 0.7628712303634905, + "grad_norm": 0.2600523856272482, + "learning_rate": 0.00014036163336933832, + "loss": 2.3424, + "step": 6653 + }, + { + "epoch": 0.7629858961128311, + "grad_norm": 0.2591644829640088, + "learning_rate": 0.00014023265127190187, + "loss": 2.3293, + "step": 6654 + }, + { + "epoch": 0.7631005618621718, + "grad_norm": 0.26723978082085725, + "learning_rate": 0.00014010371879751322, + "loss": 2.297, + "step": 6655 + }, + { + "epoch": 0.7632152276115124, + "grad_norm": 0.28077701040527386, + "learning_rate": 0.00013997483596395627, + "loss": 2.4163, + "step": 6656 + }, + { + "epoch": 0.7633298933608531, + "grad_norm": 0.2896252058771666, + "learning_rate": 0.00013984600278900804, + "loss": 2.4001, + "step": 6657 + }, + { + "epoch": 0.7634445591101938, + "grad_norm": 0.24914817655355395, + "learning_rate": 0.00013971721929043857, + "loss": 2.4258, + "step": 6658 + }, + { + "epoch": 0.7635592248595344, + "grad_norm": 0.2540792073362412, + "learning_rate": 0.00013958848548601104, + "loss": 2.2948, + "step": 6659 + }, + { + "epoch": 0.7636738906088751, + "grad_norm": 0.25133489254091174, + "learning_rate": 0.00013945980139348207, + "loss": 2.4366, + "step": 6660 + }, + { + "epoch": 0.7637885563582159, + "grad_norm": 0.2524374119126822, + "learning_rate": 0.00013933116703060077, + "loss": 2.4123, + "step": 6661 + }, + { + "epoch": 0.7639032221075565, + "grad_norm": 0.268768194441521, + "learning_rate": 0.0001392025824151101, + "loss": 2.4821, + "step": 6662 + }, + { + "epoch": 0.7640178878568972, + "grad_norm": 0.26056616480961897, + "learning_rate": 0.00013907404756474567, + "loss": 2.525, + "step": 6663 + }, + { + "epoch": 0.7641325536062378, + "grad_norm": 0.3000439367305072, + "learning_rate": 0.00013894556249723655, + "loss": 2.5407, + "step": 6664 + }, + { + "epoch": 0.7642472193555785, + "grad_norm": 0.25870897775299373, + "learning_rate": 0.0001388171272303047, + "loss": 2.2766, + "step": 6665 + }, + { + "epoch": 0.7643618851049192, + "grad_norm": 0.23113707867982647, + "learning_rate": 0.00013868874178166562, + "loss": 2.2746, + "step": 6666 + }, + { + "epoch": 0.7644765508542598, + "grad_norm": 0.22918475292273205, + "learning_rate": 0.00013856040616902715, + "loss": 2.3619, + "step": 6667 + }, + { + "epoch": 0.7645912166036005, + "grad_norm": 0.25070597552018997, + "learning_rate": 0.0001384321204100909, + "loss": 2.4057, + "step": 6668 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.2555856940874532, + "learning_rate": 0.00013830388452255182, + "loss": 2.4079, + "step": 6669 + }, + { + "epoch": 0.7648205481022818, + "grad_norm": 0.25605074313532655, + "learning_rate": 0.0001381756985240971, + "loss": 2.4445, + "step": 6670 + }, + { + "epoch": 0.7649352138516226, + "grad_norm": 0.24135209261256807, + "learning_rate": 0.00013804756243240786, + "loss": 2.4277, + "step": 6671 + }, + { + "epoch": 0.7650498796009632, + "grad_norm": 0.2646191550267873, + "learning_rate": 0.00013791947626515805, + "loss": 2.4888, + "step": 6672 + }, + { + "epoch": 0.7651645453503039, + "grad_norm": 0.2508601066718837, + "learning_rate": 0.0001377914400400143, + "loss": 2.47, + "step": 6673 + }, + { + "epoch": 0.7652792110996446, + "grad_norm": 0.25034297759677093, + "learning_rate": 0.00013766345377463725, + "loss": 2.4777, + "step": 6674 + }, + { + "epoch": 0.7653938768489852, + "grad_norm": 0.2375621951467455, + "learning_rate": 0.00013753551748668007, + "loss": 2.2888, + "step": 6675 + }, + { + "epoch": 0.7655085425983259, + "grad_norm": 0.2725747605508751, + "learning_rate": 0.00013740763119378898, + "loss": 2.6211, + "step": 6676 + }, + { + "epoch": 0.7656232083476665, + "grad_norm": 0.24579444362489467, + "learning_rate": 0.00013727979491360347, + "loss": 2.4411, + "step": 6677 + }, + { + "epoch": 0.7657378740970072, + "grad_norm": 0.24996767475687234, + "learning_rate": 0.00013715200866375627, + "loss": 2.364, + "step": 6678 + }, + { + "epoch": 0.7658525398463479, + "grad_norm": 0.24963513282280045, + "learning_rate": 0.00013702427246187288, + "loss": 2.3863, + "step": 6679 + }, + { + "epoch": 0.7659672055956885, + "grad_norm": 0.2619643266332412, + "learning_rate": 0.00013689658632557217, + "loss": 2.4444, + "step": 6680 + }, + { + "epoch": 0.7660818713450293, + "grad_norm": 0.2483229166621221, + "learning_rate": 0.00013676895027246616, + "loss": 2.4623, + "step": 6681 + }, + { + "epoch": 0.76619653709437, + "grad_norm": 0.25409547386113196, + "learning_rate": 0.00013664136432015944, + "loss": 2.3516, + "step": 6682 + }, + { + "epoch": 0.7663112028437106, + "grad_norm": 0.22683002366358138, + "learning_rate": 0.00013651382848625022, + "loss": 2.2265, + "step": 6683 + }, + { + "epoch": 0.7664258685930513, + "grad_norm": 0.24179439370549818, + "learning_rate": 0.0001363863427883299, + "loss": 2.2857, + "step": 6684 + }, + { + "epoch": 0.7665405343423919, + "grad_norm": 0.2550536490417984, + "learning_rate": 0.00013625890724398238, + "loss": 2.4005, + "step": 6685 + }, + { + "epoch": 0.7666552000917326, + "grad_norm": 0.24149959111924035, + "learning_rate": 0.00013613152187078503, + "loss": 2.2975, + "step": 6686 + }, + { + "epoch": 0.7667698658410733, + "grad_norm": 0.2723342907512863, + "learning_rate": 0.00013600418668630843, + "loss": 2.2921, + "step": 6687 + }, + { + "epoch": 0.7668845315904139, + "grad_norm": 0.25620256232546235, + "learning_rate": 0.00013587690170811562, + "loss": 2.4593, + "step": 6688 + }, + { + "epoch": 0.7669991973397546, + "grad_norm": 0.25299496307253233, + "learning_rate": 0.00013574966695376352, + "loss": 2.4248, + "step": 6689 + }, + { + "epoch": 0.7671138630890952, + "grad_norm": 0.26404362104130036, + "learning_rate": 0.00013562248244080178, + "loss": 2.4834, + "step": 6690 + }, + { + "epoch": 0.767228528838436, + "grad_norm": 0.268537337735386, + "learning_rate": 0.00013549534818677278, + "loss": 2.3818, + "step": 6691 + }, + { + "epoch": 0.7673431945877767, + "grad_norm": 0.25040490214991357, + "learning_rate": 0.0001353682642092124, + "loss": 2.327, + "step": 6692 + }, + { + "epoch": 0.7674578603371173, + "grad_norm": 0.2624302879955897, + "learning_rate": 0.00013524123052564946, + "loss": 2.4375, + "step": 6693 + }, + { + "epoch": 0.767572526086458, + "grad_norm": 0.2546766462944758, + "learning_rate": 0.00013511424715360583, + "loss": 2.3069, + "step": 6694 + }, + { + "epoch": 0.7676871918357987, + "grad_norm": 0.2591590947335354, + "learning_rate": 0.00013498731411059646, + "loss": 2.3715, + "step": 6695 + }, + { + "epoch": 0.7678018575851393, + "grad_norm": 0.238711854993485, + "learning_rate": 0.00013486043141412946, + "loss": 2.382, + "step": 6696 + }, + { + "epoch": 0.76791652333448, + "grad_norm": 0.26332454285062235, + "learning_rate": 0.00013473359908170563, + "loss": 2.3549, + "step": 6697 + }, + { + "epoch": 0.7680311890838206, + "grad_norm": 0.2714399003481173, + "learning_rate": 0.00013460681713081918, + "loss": 2.409, + "step": 6698 + }, + { + "epoch": 0.7681458548331613, + "grad_norm": 0.2535074321111449, + "learning_rate": 0.0001344800855789573, + "loss": 2.3398, + "step": 6699 + }, + { + "epoch": 0.768260520582502, + "grad_norm": 0.2761185754544816, + "learning_rate": 0.00013435340444360017, + "loss": 2.4476, + "step": 6700 + }, + { + "epoch": 0.7683751863318427, + "grad_norm": 0.2828665761643112, + "learning_rate": 0.00013422677374222097, + "loss": 2.4357, + "step": 6701 + }, + { + "epoch": 0.7684898520811834, + "grad_norm": 0.2652456285604216, + "learning_rate": 0.0001341001934922863, + "loss": 2.442, + "step": 6702 + }, + { + "epoch": 0.768604517830524, + "grad_norm": 0.2660266702564812, + "learning_rate": 0.00013397366371125491, + "loss": 2.4041, + "step": 6703 + }, + { + "epoch": 0.7687191835798647, + "grad_norm": 0.28071514494688726, + "learning_rate": 0.00013384718441657983, + "loss": 2.5201, + "step": 6704 + }, + { + "epoch": 0.7688338493292054, + "grad_norm": 0.24888585726959186, + "learning_rate": 0.000133720755625706, + "loss": 2.4086, + "step": 6705 + }, + { + "epoch": 0.768948515078546, + "grad_norm": 0.2741294606970871, + "learning_rate": 0.000133594377356072, + "loss": 2.5103, + "step": 6706 + }, + { + "epoch": 0.7690631808278867, + "grad_norm": 0.25793879299601646, + "learning_rate": 0.00013346804962510934, + "loss": 2.3862, + "step": 6707 + }, + { + "epoch": 0.7691778465772274, + "grad_norm": 0.2669726901008002, + "learning_rate": 0.00013334177245024248, + "loss": 2.2828, + "step": 6708 + }, + { + "epoch": 0.769292512326568, + "grad_norm": 0.2727491160952012, + "learning_rate": 0.00013321554584888896, + "loss": 2.3787, + "step": 6709 + }, + { + "epoch": 0.7694071780759087, + "grad_norm": 0.24429506144682955, + "learning_rate": 0.00013308936983845947, + "loss": 2.3596, + "step": 6710 + }, + { + "epoch": 0.7695218438252494, + "grad_norm": 0.2416728458294508, + "learning_rate": 0.00013296324443635738, + "loss": 2.3857, + "step": 6711 + }, + { + "epoch": 0.7696365095745901, + "grad_norm": 0.28979540804612186, + "learning_rate": 0.00013283716965997933, + "loss": 2.4526, + "step": 6712 + }, + { + "epoch": 0.7697511753239308, + "grad_norm": 0.2427351281685031, + "learning_rate": 0.00013271114552671493, + "loss": 2.4514, + "step": 6713 + }, + { + "epoch": 0.7698658410732714, + "grad_norm": 0.26762829063114646, + "learning_rate": 0.0001325851720539469, + "loss": 2.3893, + "step": 6714 + }, + { + "epoch": 0.7699805068226121, + "grad_norm": 0.2421377987805488, + "learning_rate": 0.0001324592492590508, + "loss": 2.3948, + "step": 6715 + }, + { + "epoch": 0.7700951725719528, + "grad_norm": 0.2580493156200732, + "learning_rate": 0.00013233337715939543, + "loss": 2.4551, + "step": 6716 + }, + { + "epoch": 0.7702098383212934, + "grad_norm": 0.28378972312913053, + "learning_rate": 0.00013220755577234217, + "loss": 2.5038, + "step": 6717 + }, + { + "epoch": 0.7703245040706341, + "grad_norm": 0.23599179436306258, + "learning_rate": 0.00013208178511524572, + "loss": 2.3167, + "step": 6718 + }, + { + "epoch": 0.7704391698199747, + "grad_norm": 0.2709500909450494, + "learning_rate": 0.00013195606520545412, + "loss": 2.4124, + "step": 6719 + }, + { + "epoch": 0.7705538355693154, + "grad_norm": 0.2697822011116564, + "learning_rate": 0.00013183039606030761, + "loss": 2.4825, + "step": 6720 + }, + { + "epoch": 0.7706685013186562, + "grad_norm": 0.24119099763353175, + "learning_rate": 0.00013170477769714007, + "loss": 2.4273, + "step": 6721 + }, + { + "epoch": 0.7707831670679968, + "grad_norm": 0.2491072135795041, + "learning_rate": 0.00013157921013327818, + "loss": 2.4016, + "step": 6722 + }, + { + "epoch": 0.7708978328173375, + "grad_norm": 0.26240384137404155, + "learning_rate": 0.00013145369338604123, + "loss": 2.3999, + "step": 6723 + }, + { + "epoch": 0.7710124985666781, + "grad_norm": 0.25394163439574824, + "learning_rate": 0.0001313282274727423, + "loss": 2.4186, + "step": 6724 + }, + { + "epoch": 0.7711271643160188, + "grad_norm": 0.29144253215777816, + "learning_rate": 0.00013120281241068706, + "loss": 2.3929, + "step": 6725 + }, + { + "epoch": 0.7712418300653595, + "grad_norm": 0.24468945285131885, + "learning_rate": 0.00013107744821717378, + "loss": 2.4242, + "step": 6726 + }, + { + "epoch": 0.7713564958147001, + "grad_norm": 0.23992107096356521, + "learning_rate": 0.00013095213490949416, + "loss": 2.4502, + "step": 6727 + }, + { + "epoch": 0.7714711615640408, + "grad_norm": 0.25561032155131896, + "learning_rate": 0.00013082687250493287, + "loss": 2.4754, + "step": 6728 + }, + { + "epoch": 0.7715858273133815, + "grad_norm": 0.24810590538965988, + "learning_rate": 0.00013070166102076741, + "loss": 2.3403, + "step": 6729 + }, + { + "epoch": 0.7717004930627221, + "grad_norm": 0.2535647232351421, + "learning_rate": 0.00013057650047426834, + "loss": 2.4285, + "step": 6730 + }, + { + "epoch": 0.7718151588120629, + "grad_norm": 0.2607598592409844, + "learning_rate": 0.00013045139088269935, + "loss": 2.4769, + "step": 6731 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 0.24281646364076165, + "learning_rate": 0.00013032633226331663, + "loss": 2.3052, + "step": 6732 + }, + { + "epoch": 0.7720444903107442, + "grad_norm": 0.27588018337218645, + "learning_rate": 0.0001302013246333696, + "loss": 2.35, + "step": 6733 + }, + { + "epoch": 0.7721591560600849, + "grad_norm": 0.23288382665180205, + "learning_rate": 0.0001300763680101011, + "loss": 2.4203, + "step": 6734 + }, + { + "epoch": 0.7722738218094255, + "grad_norm": 0.26362651134625326, + "learning_rate": 0.00012995146241074606, + "loss": 2.3343, + "step": 6735 + }, + { + "epoch": 0.7723884875587662, + "grad_norm": 0.23797718744264573, + "learning_rate": 0.00012982660785253308, + "loss": 2.4258, + "step": 6736 + }, + { + "epoch": 0.7725031533081068, + "grad_norm": 0.23939278455290353, + "learning_rate": 0.00012970180435268352, + "loss": 2.4245, + "step": 6737 + }, + { + "epoch": 0.7726178190574475, + "grad_norm": 0.2531636203113712, + "learning_rate": 0.00012957705192841125, + "loss": 2.4769, + "step": 6738 + }, + { + "epoch": 0.7727324848067882, + "grad_norm": 0.2426578461368766, + "learning_rate": 0.0001294523505969239, + "loss": 2.5486, + "step": 6739 + }, + { + "epoch": 0.7728471505561288, + "grad_norm": 0.28811562532877744, + "learning_rate": 0.00012932770037542163, + "loss": 2.4666, + "step": 6740 + }, + { + "epoch": 0.7729618163054696, + "grad_norm": 0.266875482029193, + "learning_rate": 0.0001292031012810973, + "loss": 2.4406, + "step": 6741 + }, + { + "epoch": 0.7730764820548103, + "grad_norm": 0.23962694371038845, + "learning_rate": 0.0001290785533311371, + "loss": 2.3326, + "step": 6742 + }, + { + "epoch": 0.7731911478041509, + "grad_norm": 0.2922606547071169, + "learning_rate": 0.00012895405654272013, + "loss": 2.4653, + "step": 6743 + }, + { + "epoch": 0.7733058135534916, + "grad_norm": 0.2621888353280769, + "learning_rate": 0.00012882961093301821, + "loss": 2.4894, + "step": 6744 + }, + { + "epoch": 0.7734204793028322, + "grad_norm": 0.24649893214913532, + "learning_rate": 0.00012870521651919638, + "loss": 2.3794, + "step": 6745 + }, + { + "epoch": 0.7735351450521729, + "grad_norm": 0.24967562397585344, + "learning_rate": 0.00012858087331841245, + "loss": 2.3528, + "step": 6746 + }, + { + "epoch": 0.7736498108015136, + "grad_norm": 0.25300181162821855, + "learning_rate": 0.00012845658134781707, + "loss": 2.3722, + "step": 6747 + }, + { + "epoch": 0.7737644765508542, + "grad_norm": 0.2558374445529858, + "learning_rate": 0.00012833234062455408, + "loss": 2.2594, + "step": 6748 + }, + { + "epoch": 0.7738791423001949, + "grad_norm": 0.2309763115483486, + "learning_rate": 0.00012820815116576002, + "loss": 2.397, + "step": 6749 + }, + { + "epoch": 0.7739938080495357, + "grad_norm": 0.24819263235431924, + "learning_rate": 0.00012808401298856448, + "loss": 2.4286, + "step": 6750 + }, + { + "epoch": 0.7741084737988763, + "grad_norm": 0.2644804030517646, + "learning_rate": 0.00012795992611009, + "loss": 2.3965, + "step": 6751 + }, + { + "epoch": 0.774223139548217, + "grad_norm": 0.2621979907022684, + "learning_rate": 0.00012783589054745216, + "loss": 2.3417, + "step": 6752 + }, + { + "epoch": 0.7743378052975576, + "grad_norm": 0.2547485839487469, + "learning_rate": 0.0001277119063177588, + "loss": 2.3489, + "step": 6753 + }, + { + "epoch": 0.7744524710468983, + "grad_norm": 0.24904654280401417, + "learning_rate": 0.00012758797343811185, + "loss": 2.3808, + "step": 6754 + }, + { + "epoch": 0.774567136796239, + "grad_norm": 0.26786175103033183, + "learning_rate": 0.00012746409192560494, + "loss": 2.4109, + "step": 6755 + }, + { + "epoch": 0.7746818025455796, + "grad_norm": 0.2515542511824764, + "learning_rate": 0.0001273402617973254, + "loss": 2.4858, + "step": 6756 + }, + { + "epoch": 0.7747964682949203, + "grad_norm": 0.23798496270028302, + "learning_rate": 0.0001272164830703532, + "loss": 2.3564, + "step": 6757 + }, + { + "epoch": 0.7749111340442609, + "grad_norm": 0.23510520203873148, + "learning_rate": 0.00012709275576176126, + "loss": 2.4671, + "step": 6758 + }, + { + "epoch": 0.7750257997936016, + "grad_norm": 0.2652416024050623, + "learning_rate": 0.0001269690798886154, + "loss": 2.3175, + "step": 6759 + }, + { + "epoch": 0.7751404655429424, + "grad_norm": 0.2288314665896917, + "learning_rate": 0.0001268454554679745, + "loss": 2.4389, + "step": 6760 + }, + { + "epoch": 0.775255131292283, + "grad_norm": 0.31235679679253275, + "learning_rate": 0.00012672188251688988, + "loss": 2.4288, + "step": 6761 + }, + { + "epoch": 0.7753697970416237, + "grad_norm": 0.24916806110380782, + "learning_rate": 0.0001265983610524063, + "loss": 2.4176, + "step": 6762 + }, + { + "epoch": 0.7754844627909644, + "grad_norm": 0.26552106376410894, + "learning_rate": 0.00012647489109156106, + "loss": 2.3202, + "step": 6763 + }, + { + "epoch": 0.775599128540305, + "grad_norm": 0.23830204637106392, + "learning_rate": 0.00012635147265138458, + "loss": 2.4354, + "step": 6764 + }, + { + "epoch": 0.7757137942896457, + "grad_norm": 0.2717537917902372, + "learning_rate": 0.0001262281057489001, + "loss": 2.3551, + "step": 6765 + }, + { + "epoch": 0.7758284600389863, + "grad_norm": 0.2416107359097658, + "learning_rate": 0.00012610479040112377, + "loss": 2.4243, + "step": 6766 + }, + { + "epoch": 0.775943125788327, + "grad_norm": 0.2363310438225181, + "learning_rate": 0.0001259815266250644, + "loss": 2.4865, + "step": 6767 + }, + { + "epoch": 0.7760577915376677, + "grad_norm": 0.2512659079546379, + "learning_rate": 0.00012585831443772384, + "loss": 2.51, + "step": 6768 + }, + { + "epoch": 0.7761724572870083, + "grad_norm": 0.2503441228925365, + "learning_rate": 0.00012573515385609735, + "loss": 2.3356, + "step": 6769 + }, + { + "epoch": 0.7762871230363491, + "grad_norm": 0.2489387387398845, + "learning_rate": 0.00012561204489717204, + "loss": 2.4542, + "step": 6770 + }, + { + "epoch": 0.7764017887856897, + "grad_norm": 0.2501216635199761, + "learning_rate": 0.0001254889875779287, + "loss": 2.2867, + "step": 6771 + }, + { + "epoch": 0.7765164545350304, + "grad_norm": 0.23454760424364315, + "learning_rate": 0.00012536598191534081, + "loss": 2.3632, + "step": 6772 + }, + { + "epoch": 0.7766311202843711, + "grad_norm": 0.25266289697076355, + "learning_rate": 0.00012524302792637427, + "loss": 2.4499, + "step": 6773 + }, + { + "epoch": 0.7767457860337117, + "grad_norm": 0.2257019356786494, + "learning_rate": 0.00012512012562798864, + "loss": 2.3209, + "step": 6774 + }, + { + "epoch": 0.7768604517830524, + "grad_norm": 0.2490724954470137, + "learning_rate": 0.00012499727503713593, + "loss": 2.3692, + "step": 6775 + }, + { + "epoch": 0.7769751175323931, + "grad_norm": 0.25975149663652675, + "learning_rate": 0.0001248744761707608, + "loss": 2.401, + "step": 6776 + }, + { + "epoch": 0.7770897832817337, + "grad_norm": 0.2616494721500692, + "learning_rate": 0.00012475172904580117, + "loss": 2.3494, + "step": 6777 + }, + { + "epoch": 0.7772044490310744, + "grad_norm": 0.26254857361496403, + "learning_rate": 0.00012462903367918766, + "loss": 2.3636, + "step": 6778 + }, + { + "epoch": 0.777319114780415, + "grad_norm": 0.25641683363116075, + "learning_rate": 0.00012450639008784365, + "loss": 2.2955, + "step": 6779 + }, + { + "epoch": 0.7774337805297558, + "grad_norm": 0.24962073410301486, + "learning_rate": 0.00012438379828868563, + "loss": 2.4942, + "step": 6780 + }, + { + "epoch": 0.7775484462790965, + "grad_norm": 0.28383966436989877, + "learning_rate": 0.00012426125829862288, + "loss": 2.3656, + "step": 6781 + }, + { + "epoch": 0.7776631120284371, + "grad_norm": 0.2734082063305767, + "learning_rate": 0.00012413877013455716, + "loss": 2.3577, + "step": 6782 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.28827860255289034, + "learning_rate": 0.0001240163338133834, + "loss": 2.457, + "step": 6783 + }, + { + "epoch": 0.7778924435271185, + "grad_norm": 0.29734988787010436, + "learning_rate": 0.0001238939493519897, + "loss": 2.376, + "step": 6784 + }, + { + "epoch": 0.7780071092764591, + "grad_norm": 0.26946110298299003, + "learning_rate": 0.0001237716167672564, + "loss": 2.5524, + "step": 6785 + }, + { + "epoch": 0.7781217750257998, + "grad_norm": 0.248865538876543, + "learning_rate": 0.000123649336076057, + "loss": 2.3772, + "step": 6786 + }, + { + "epoch": 0.7782364407751404, + "grad_norm": 0.271226080741363, + "learning_rate": 0.0001235271072952579, + "loss": 2.3481, + "step": 6787 + }, + { + "epoch": 0.7783511065244811, + "grad_norm": 0.26892458397232577, + "learning_rate": 0.00012340493044171787, + "loss": 2.4388, + "step": 6788 + }, + { + "epoch": 0.7784657722738219, + "grad_norm": 0.28585478622875443, + "learning_rate": 0.0001232828055322892, + "loss": 2.3622, + "step": 6789 + }, + { + "epoch": 0.7785804380231625, + "grad_norm": 0.23281361216201132, + "learning_rate": 0.00012316073258381683, + "loss": 2.4165, + "step": 6790 + }, + { + "epoch": 0.7786951037725032, + "grad_norm": 0.26412077109659277, + "learning_rate": 0.00012303871161313805, + "loss": 2.4098, + "step": 6791 + }, + { + "epoch": 0.7788097695218438, + "grad_norm": 0.2631627870531466, + "learning_rate": 0.00012291674263708346, + "loss": 2.448, + "step": 6792 + }, + { + "epoch": 0.7789244352711845, + "grad_norm": 0.25745554900012735, + "learning_rate": 0.00012279482567247634, + "loss": 2.3164, + "step": 6793 + }, + { + "epoch": 0.7790391010205252, + "grad_norm": 0.25013605326549, + "learning_rate": 0.00012267296073613287, + "loss": 2.3426, + "step": 6794 + }, + { + "epoch": 0.7791537667698658, + "grad_norm": 0.3236628537191183, + "learning_rate": 0.00012255114784486194, + "loss": 2.4647, + "step": 6795 + }, + { + "epoch": 0.7792684325192065, + "grad_norm": 0.2839006306201851, + "learning_rate": 0.00012242938701546545, + "loss": 2.2653, + "step": 6796 + }, + { + "epoch": 0.7793830982685472, + "grad_norm": 0.25635341730623284, + "learning_rate": 0.00012230767826473777, + "loss": 2.3439, + "step": 6797 + }, + { + "epoch": 0.7794977640178878, + "grad_norm": 0.2627817694497659, + "learning_rate": 0.0001221860216094664, + "loss": 2.559, + "step": 6798 + }, + { + "epoch": 0.7796124297672286, + "grad_norm": 0.2421796728964254, + "learning_rate": 0.00012206441706643151, + "loss": 2.3482, + "step": 6799 + }, + { + "epoch": 0.7797270955165692, + "grad_norm": 0.25841308985607714, + "learning_rate": 0.00012194286465240623, + "loss": 2.5083, + "step": 6800 + }, + { + "epoch": 0.7798417612659099, + "grad_norm": 0.24702732459447166, + "learning_rate": 0.00012182136438415631, + "loss": 2.3457, + "step": 6801 + }, + { + "epoch": 0.7799564270152506, + "grad_norm": 0.24193101268822317, + "learning_rate": 0.00012169991627844063, + "loss": 2.4964, + "step": 6802 + }, + { + "epoch": 0.7800710927645912, + "grad_norm": 0.26623638447971226, + "learning_rate": 0.00012157852035201012, + "loss": 2.3721, + "step": 6803 + }, + { + "epoch": 0.7801857585139319, + "grad_norm": 0.25753972279585624, + "learning_rate": 0.00012145717662160966, + "loss": 2.5166, + "step": 6804 + }, + { + "epoch": 0.7803004242632725, + "grad_norm": 0.24598753773961207, + "learning_rate": 0.0001213358851039758, + "loss": 2.4094, + "step": 6805 + }, + { + "epoch": 0.7804150900126132, + "grad_norm": 0.2548999635180375, + "learning_rate": 0.00012121464581583868, + "loss": 2.4119, + "step": 6806 + }, + { + "epoch": 0.7805297557619539, + "grad_norm": 0.22325814670259, + "learning_rate": 0.0001210934587739208, + "loss": 2.4082, + "step": 6807 + }, + { + "epoch": 0.7806444215112945, + "grad_norm": 0.25903921462501833, + "learning_rate": 0.00012097232399493768, + "loss": 2.4685, + "step": 6808 + }, + { + "epoch": 0.7807590872606353, + "grad_norm": 0.25636680944453333, + "learning_rate": 0.00012085124149559757, + "loss": 2.514, + "step": 6809 + }, + { + "epoch": 0.780873753009976, + "grad_norm": 0.24377042532608056, + "learning_rate": 0.00012073021129260158, + "loss": 2.4624, + "step": 6810 + }, + { + "epoch": 0.7809884187593166, + "grad_norm": 0.26951914093943835, + "learning_rate": 0.00012060923340264329, + "loss": 2.4367, + "step": 6811 + }, + { + "epoch": 0.7811030845086573, + "grad_norm": 0.2667855628320824, + "learning_rate": 0.00012048830784240939, + "loss": 2.6164, + "step": 6812 + }, + { + "epoch": 0.7812177502579979, + "grad_norm": 0.266745529842439, + "learning_rate": 0.00012036743462857924, + "loss": 2.4539, + "step": 6813 + }, + { + "epoch": 0.7813324160073386, + "grad_norm": 0.25270594220148607, + "learning_rate": 0.00012024661377782503, + "loss": 2.5068, + "step": 6814 + }, + { + "epoch": 0.7814470817566793, + "grad_norm": 0.23801915189603112, + "learning_rate": 0.00012012584530681175, + "loss": 2.2616, + "step": 6815 + }, + { + "epoch": 0.7815617475060199, + "grad_norm": 0.29365007203058474, + "learning_rate": 0.00012000512923219714, + "loss": 2.4966, + "step": 6816 + }, + { + "epoch": 0.7816764132553606, + "grad_norm": 0.222740240741158, + "learning_rate": 0.00011988446557063148, + "loss": 2.3433, + "step": 6817 + }, + { + "epoch": 0.7817910790047014, + "grad_norm": 0.22859403909681947, + "learning_rate": 0.00011976385433875797, + "loss": 2.3845, + "step": 6818 + }, + { + "epoch": 0.781905744754042, + "grad_norm": 0.24469848142173548, + "learning_rate": 0.00011964329555321307, + "loss": 2.2305, + "step": 6819 + }, + { + "epoch": 0.7820204105033827, + "grad_norm": 0.2374452383230014, + "learning_rate": 0.0001195227892306252, + "loss": 2.422, + "step": 6820 + }, + { + "epoch": 0.7821350762527233, + "grad_norm": 0.24625181140730534, + "learning_rate": 0.00011940233538761597, + "loss": 2.5437, + "step": 6821 + }, + { + "epoch": 0.782249742002064, + "grad_norm": 0.259538065771925, + "learning_rate": 0.00011928193404079967, + "loss": 2.3026, + "step": 6822 + }, + { + "epoch": 0.7823644077514047, + "grad_norm": 0.23823653868265982, + "learning_rate": 0.0001191615852067835, + "loss": 2.2802, + "step": 6823 + }, + { + "epoch": 0.7824790735007453, + "grad_norm": 0.2510478757940728, + "learning_rate": 0.00011904128890216715, + "loss": 2.4359, + "step": 6824 + }, + { + "epoch": 0.782593739250086, + "grad_norm": 0.24451770316097682, + "learning_rate": 0.00011892104514354347, + "loss": 2.2862, + "step": 6825 + }, + { + "epoch": 0.7827084049994266, + "grad_norm": 0.2372839016313924, + "learning_rate": 0.0001188008539474974, + "loss": 2.3743, + "step": 6826 + }, + { + "epoch": 0.7828230707487673, + "grad_norm": 0.2609022122381084, + "learning_rate": 0.0001186807153306072, + "loss": 2.267, + "step": 6827 + }, + { + "epoch": 0.782937736498108, + "grad_norm": 0.2776608846118546, + "learning_rate": 0.00011856062930944372, + "loss": 2.4965, + "step": 6828 + }, + { + "epoch": 0.7830524022474487, + "grad_norm": 0.24238856462568917, + "learning_rate": 0.00011844059590057054, + "loss": 2.4751, + "step": 6829 + }, + { + "epoch": 0.7831670679967894, + "grad_norm": 0.25136234726901846, + "learning_rate": 0.00011832061512054398, + "loss": 2.2689, + "step": 6830 + }, + { + "epoch": 0.7832817337461301, + "grad_norm": 0.2668117376700046, + "learning_rate": 0.00011820068698591324, + "loss": 2.2448, + "step": 6831 + }, + { + "epoch": 0.7833963994954707, + "grad_norm": 0.2351329250658453, + "learning_rate": 0.00011808081151321982, + "loss": 2.3166, + "step": 6832 + }, + { + "epoch": 0.7835110652448114, + "grad_norm": 0.2603827509706882, + "learning_rate": 0.00011796098871899824, + "loss": 2.2767, + "step": 6833 + }, + { + "epoch": 0.783625730994152, + "grad_norm": 0.2849077807721982, + "learning_rate": 0.00011784121861977625, + "loss": 2.3031, + "step": 6834 + }, + { + "epoch": 0.7837403967434927, + "grad_norm": 0.2521469876072976, + "learning_rate": 0.00011772150123207343, + "loss": 2.372, + "step": 6835 + }, + { + "epoch": 0.7838550624928334, + "grad_norm": 0.280494473510368, + "learning_rate": 0.0001176018365724026, + "loss": 2.4214, + "step": 6836 + }, + { + "epoch": 0.783969728242174, + "grad_norm": 0.263191933019923, + "learning_rate": 0.00011748222465726937, + "loss": 2.3821, + "step": 6837 + }, + { + "epoch": 0.7840843939915147, + "grad_norm": 0.2413776709440937, + "learning_rate": 0.00011736266550317154, + "loss": 2.3315, + "step": 6838 + }, + { + "epoch": 0.7841990597408554, + "grad_norm": 0.26708315330374316, + "learning_rate": 0.00011724315912660038, + "loss": 2.467, + "step": 6839 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.24713110207981037, + "learning_rate": 0.00011712370554403956, + "loss": 2.4164, + "step": 6840 + }, + { + "epoch": 0.7844283912395368, + "grad_norm": 0.27159626918779534, + "learning_rate": 0.00011700430477196516, + "loss": 2.3713, + "step": 6841 + }, + { + "epoch": 0.7845430569888774, + "grad_norm": 0.2706520747692881, + "learning_rate": 0.0001168849568268463, + "loss": 2.4882, + "step": 6842 + }, + { + "epoch": 0.7846577227382181, + "grad_norm": 0.24992521687614402, + "learning_rate": 0.00011676566172514485, + "loss": 2.4342, + "step": 6843 + }, + { + "epoch": 0.7847723884875588, + "grad_norm": 0.24723570819689836, + "learning_rate": 0.00011664641948331528, + "loss": 2.4522, + "step": 6844 + }, + { + "epoch": 0.7848870542368994, + "grad_norm": 0.255953626650492, + "learning_rate": 0.0001165272301178047, + "loss": 2.4019, + "step": 6845 + }, + { + "epoch": 0.7850017199862401, + "grad_norm": 0.24775299340232967, + "learning_rate": 0.00011640809364505329, + "loss": 2.4685, + "step": 6846 + }, + { + "epoch": 0.7851163857355807, + "grad_norm": 0.2573375269025107, + "learning_rate": 0.00011628901008149323, + "loss": 2.4114, + "step": 6847 + }, + { + "epoch": 0.7852310514849214, + "grad_norm": 0.24830087530723977, + "learning_rate": 0.00011616997944355, + "loss": 2.2887, + "step": 6848 + }, + { + "epoch": 0.7853457172342622, + "grad_norm": 0.25495053815830065, + "learning_rate": 0.00011605100174764172, + "loss": 2.4399, + "step": 6849 + }, + { + "epoch": 0.7854603829836028, + "grad_norm": 0.2463684664289024, + "learning_rate": 0.00011593207701017894, + "loss": 2.4928, + "step": 6850 + }, + { + "epoch": 0.7855750487329435, + "grad_norm": 0.2466823067834659, + "learning_rate": 0.00011581320524756517, + "loss": 2.4406, + "step": 6851 + }, + { + "epoch": 0.7856897144822842, + "grad_norm": 0.2411050771898711, + "learning_rate": 0.00011569438647619662, + "loss": 2.4069, + "step": 6852 + }, + { + "epoch": 0.7858043802316248, + "grad_norm": 0.254349851043376, + "learning_rate": 0.00011557562071246163, + "loss": 2.5172, + "step": 6853 + }, + { + "epoch": 0.7859190459809655, + "grad_norm": 0.25046515278037307, + "learning_rate": 0.00011545690797274228, + "loss": 2.4405, + "step": 6854 + }, + { + "epoch": 0.7860337117303061, + "grad_norm": 0.2544690464158616, + "learning_rate": 0.00011533824827341227, + "loss": 2.3124, + "step": 6855 + }, + { + "epoch": 0.7861483774796468, + "grad_norm": 0.2548610387103973, + "learning_rate": 0.00011521964163083853, + "loss": 2.461, + "step": 6856 + }, + { + "epoch": 0.7862630432289875, + "grad_norm": 0.21374969542326022, + "learning_rate": 0.0001151010880613807, + "loss": 2.4123, + "step": 6857 + }, + { + "epoch": 0.7863777089783281, + "grad_norm": 0.24147315581330375, + "learning_rate": 0.00011498258758139096, + "loss": 2.3451, + "step": 6858 + }, + { + "epoch": 0.7864923747276689, + "grad_norm": 0.24057022307280926, + "learning_rate": 0.00011486414020721409, + "loss": 2.435, + "step": 6859 + }, + { + "epoch": 0.7866070404770095, + "grad_norm": 0.2495311796770366, + "learning_rate": 0.00011474574595518777, + "loss": 2.5052, + "step": 6860 + }, + { + "epoch": 0.7867217062263502, + "grad_norm": 0.22915948494123575, + "learning_rate": 0.0001146274048416423, + "loss": 2.3789, + "step": 6861 + }, + { + "epoch": 0.7868363719756909, + "grad_norm": 0.2760430460121054, + "learning_rate": 0.00011450911688290033, + "loss": 2.4967, + "step": 6862 + }, + { + "epoch": 0.7869510377250315, + "grad_norm": 0.2475679763885997, + "learning_rate": 0.00011439088209527754, + "loss": 2.3535, + "step": 6863 + }, + { + "epoch": 0.7870657034743722, + "grad_norm": 0.2711487140131415, + "learning_rate": 0.00011427270049508221, + "loss": 2.4046, + "step": 6864 + }, + { + "epoch": 0.7871803692237129, + "grad_norm": 0.25477261163360804, + "learning_rate": 0.00011415457209861524, + "loss": 2.403, + "step": 6865 + }, + { + "epoch": 0.7872950349730535, + "grad_norm": 0.2508496060122097, + "learning_rate": 0.00011403649692217011, + "loss": 2.3584, + "step": 6866 + }, + { + "epoch": 0.7874097007223942, + "grad_norm": 0.25643849399689145, + "learning_rate": 0.00011391847498203328, + "loss": 2.4111, + "step": 6867 + }, + { + "epoch": 0.7875243664717348, + "grad_norm": 0.251245652083769, + "learning_rate": 0.00011380050629448313, + "loss": 2.4071, + "step": 6868 + }, + { + "epoch": 0.7876390322210756, + "grad_norm": 0.25048520917210765, + "learning_rate": 0.0001136825908757918, + "loss": 2.2692, + "step": 6869 + }, + { + "epoch": 0.7877536979704163, + "grad_norm": 0.2622116550506559, + "learning_rate": 0.00011356472874222307, + "loss": 2.3229, + "step": 6870 + }, + { + "epoch": 0.7878683637197569, + "grad_norm": 0.25364752025709036, + "learning_rate": 0.00011344691991003386, + "loss": 2.2759, + "step": 6871 + }, + { + "epoch": 0.7879830294690976, + "grad_norm": 0.23652624271123546, + "learning_rate": 0.00011332916439547363, + "loss": 2.3852, + "step": 6872 + }, + { + "epoch": 0.7880976952184382, + "grad_norm": 0.260825287035466, + "learning_rate": 0.00011321146221478462, + "loss": 2.279, + "step": 6873 + }, + { + "epoch": 0.7882123609677789, + "grad_norm": 0.2535277582055584, + "learning_rate": 0.0001130938133842015, + "loss": 2.3158, + "step": 6874 + }, + { + "epoch": 0.7883270267171196, + "grad_norm": 0.24090115930141076, + "learning_rate": 0.00011297621791995188, + "loss": 2.4916, + "step": 6875 + }, + { + "epoch": 0.7884416924664602, + "grad_norm": 0.27281986870535163, + "learning_rate": 0.00011285867583825549, + "loss": 2.5464, + "step": 6876 + }, + { + "epoch": 0.788556358215801, + "grad_norm": 0.2717714055125925, + "learning_rate": 0.00011274118715532522, + "loss": 2.3902, + "step": 6877 + }, + { + "epoch": 0.7886710239651417, + "grad_norm": 0.27084529366944915, + "learning_rate": 0.00011262375188736634, + "loss": 2.4643, + "step": 6878 + }, + { + "epoch": 0.7887856897144823, + "grad_norm": 0.2683490756462135, + "learning_rate": 0.00011250637005057685, + "loss": 2.4136, + "step": 6879 + }, + { + "epoch": 0.788900355463823, + "grad_norm": 0.26444407424945043, + "learning_rate": 0.00011238904166114727, + "loss": 2.5191, + "step": 6880 + }, + { + "epoch": 0.7890150212131636, + "grad_norm": 0.25597989549114103, + "learning_rate": 0.00011227176673526101, + "loss": 2.3583, + "step": 6881 + }, + { + "epoch": 0.7891296869625043, + "grad_norm": 0.2394916937439897, + "learning_rate": 0.00011215454528909358, + "loss": 2.3644, + "step": 6882 + }, + { + "epoch": 0.789244352711845, + "grad_norm": 0.2905449000116514, + "learning_rate": 0.00011203737733881353, + "loss": 2.2051, + "step": 6883 + }, + { + "epoch": 0.7893590184611856, + "grad_norm": 0.29653055768440506, + "learning_rate": 0.00011192026290058233, + "loss": 2.367, + "step": 6884 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.24697146859657346, + "learning_rate": 0.00011180320199055322, + "loss": 2.5159, + "step": 6885 + }, + { + "epoch": 0.789588349959867, + "grad_norm": 0.24185975784157224, + "learning_rate": 0.00011168619462487267, + "loss": 2.4004, + "step": 6886 + }, + { + "epoch": 0.7897030157092076, + "grad_norm": 0.27624984270824127, + "learning_rate": 0.00011156924081967978, + "loss": 2.5588, + "step": 6887 + }, + { + "epoch": 0.7898176814585484, + "grad_norm": 0.24214968784755503, + "learning_rate": 0.00011145234059110571, + "loss": 2.4612, + "step": 6888 + }, + { + "epoch": 0.789932347207889, + "grad_norm": 0.25095297246538084, + "learning_rate": 0.00011133549395527497, + "loss": 2.3112, + "step": 6889 + }, + { + "epoch": 0.7900470129572297, + "grad_norm": 0.2650610791748218, + "learning_rate": 0.00011121870092830439, + "loss": 2.4206, + "step": 6890 + }, + { + "epoch": 0.7901616787065704, + "grad_norm": 0.2524768539756423, + "learning_rate": 0.00011110196152630303, + "loss": 2.3046, + "step": 6891 + }, + { + "epoch": 0.790276344455911, + "grad_norm": 0.26181449084924485, + "learning_rate": 0.00011098527576537299, + "loss": 2.3896, + "step": 6892 + }, + { + "epoch": 0.7903910102052517, + "grad_norm": 0.23455171418778667, + "learning_rate": 0.00011086864366160887, + "loss": 2.326, + "step": 6893 + }, + { + "epoch": 0.7905056759545923, + "grad_norm": 0.26729612630694116, + "learning_rate": 0.00011075206523109783, + "loss": 2.3269, + "step": 6894 + }, + { + "epoch": 0.790620341703933, + "grad_norm": 0.24974223499420717, + "learning_rate": 0.00011063554048991964, + "loss": 2.399, + "step": 6895 + }, + { + "epoch": 0.7907350074532737, + "grad_norm": 0.2469709455907023, + "learning_rate": 0.00011051906945414686, + "loss": 2.3645, + "step": 6896 + }, + { + "epoch": 0.7908496732026143, + "grad_norm": 0.2667849329771901, + "learning_rate": 0.00011040265213984407, + "loss": 2.4399, + "step": 6897 + }, + { + "epoch": 0.7909643389519551, + "grad_norm": 0.26081779772397223, + "learning_rate": 0.00011028628856306905, + "loss": 2.3812, + "step": 6898 + }, + { + "epoch": 0.7910790047012958, + "grad_norm": 0.2549624530959558, + "learning_rate": 0.00011016997873987189, + "loss": 2.4652, + "step": 6899 + }, + { + "epoch": 0.7911936704506364, + "grad_norm": 0.24701631797960671, + "learning_rate": 0.00011005372268629538, + "loss": 2.3831, + "step": 6900 + }, + { + "epoch": 0.7913083361999771, + "grad_norm": 0.2333277746841834, + "learning_rate": 0.00010993752041837474, + "loss": 2.4187, + "step": 6901 + }, + { + "epoch": 0.7914230019493177, + "grad_norm": 0.24730614753844446, + "learning_rate": 0.00010982137195213799, + "loss": 2.4569, + "step": 6902 + }, + { + "epoch": 0.7915376676986584, + "grad_norm": 0.24357275050037164, + "learning_rate": 0.00010970527730360524, + "loss": 2.4461, + "step": 6903 + }, + { + "epoch": 0.7916523334479991, + "grad_norm": 0.2545264542299276, + "learning_rate": 0.00010958923648878993, + "loss": 2.382, + "step": 6904 + }, + { + "epoch": 0.7917669991973397, + "grad_norm": 0.2515001283477543, + "learning_rate": 0.0001094732495236977, + "loss": 2.3751, + "step": 6905 + }, + { + "epoch": 0.7918816649466804, + "grad_norm": 0.25198306552429056, + "learning_rate": 0.00010935731642432644, + "loss": 2.4867, + "step": 6906 + }, + { + "epoch": 0.791996330696021, + "grad_norm": 0.25088314249643856, + "learning_rate": 0.00010924143720666708, + "loss": 2.41, + "step": 6907 + }, + { + "epoch": 0.7921109964453618, + "grad_norm": 0.2582129420231825, + "learning_rate": 0.00010912561188670284, + "loss": 2.2643, + "step": 6908 + }, + { + "epoch": 0.7922256621947025, + "grad_norm": 0.24178145220770467, + "learning_rate": 0.00010900984048040974, + "loss": 2.4595, + "step": 6909 + }, + { + "epoch": 0.7923403279440431, + "grad_norm": 0.25189715797031564, + "learning_rate": 0.00010889412300375623, + "loss": 2.3524, + "step": 6910 + }, + { + "epoch": 0.7924549936933838, + "grad_norm": 0.2577826975392892, + "learning_rate": 0.00010877845947270337, + "loss": 2.3243, + "step": 6911 + }, + { + "epoch": 0.7925696594427245, + "grad_norm": 0.2508361227879777, + "learning_rate": 0.00010866284990320457, + "loss": 2.5711, + "step": 6912 + }, + { + "epoch": 0.7926843251920651, + "grad_norm": 0.29107575404734926, + "learning_rate": 0.00010854729431120608, + "loss": 2.5415, + "step": 6913 + }, + { + "epoch": 0.7927989909414058, + "grad_norm": 0.24585978633505923, + "learning_rate": 0.00010843179271264652, + "loss": 2.3768, + "step": 6914 + }, + { + "epoch": 0.7929136566907464, + "grad_norm": 0.27533813824283, + "learning_rate": 0.00010831634512345722, + "loss": 2.2871, + "step": 6915 + }, + { + "epoch": 0.7930283224400871, + "grad_norm": 0.2747661895741441, + "learning_rate": 0.00010820095155956194, + "loss": 2.4561, + "step": 6916 + }, + { + "epoch": 0.7931429881894279, + "grad_norm": 0.2696816385766819, + "learning_rate": 0.00010808561203687712, + "loss": 2.3377, + "step": 6917 + }, + { + "epoch": 0.7932576539387685, + "grad_norm": 0.25766336227000675, + "learning_rate": 0.00010797032657131135, + "loss": 2.3815, + "step": 6918 + }, + { + "epoch": 0.7933723196881092, + "grad_norm": 0.2606992113217092, + "learning_rate": 0.00010785509517876646, + "loss": 2.419, + "step": 6919 + }, + { + "epoch": 0.7934869854374499, + "grad_norm": 0.2501851928546129, + "learning_rate": 0.00010773991787513615, + "loss": 2.314, + "step": 6920 + }, + { + "epoch": 0.7936016511867905, + "grad_norm": 0.2530499049338274, + "learning_rate": 0.00010762479467630698, + "loss": 2.2749, + "step": 6921 + }, + { + "epoch": 0.7937163169361312, + "grad_norm": 0.2616008267612015, + "learning_rate": 0.00010750972559815803, + "loss": 2.4719, + "step": 6922 + }, + { + "epoch": 0.7938309826854718, + "grad_norm": 0.2660578440712656, + "learning_rate": 0.00010739471065656092, + "loss": 2.4008, + "step": 6923 + }, + { + "epoch": 0.7939456484348125, + "grad_norm": 0.27026244871971705, + "learning_rate": 0.00010727974986737971, + "loss": 2.3796, + "step": 6924 + }, + { + "epoch": 0.7940603141841532, + "grad_norm": 0.2622607520937587, + "learning_rate": 0.00010716484324647119, + "loss": 2.3566, + "step": 6925 + }, + { + "epoch": 0.7941749799334938, + "grad_norm": 0.2659642750302182, + "learning_rate": 0.00010704999080968431, + "loss": 2.3265, + "step": 6926 + }, + { + "epoch": 0.7942896456828346, + "grad_norm": 0.2601833551664661, + "learning_rate": 0.00010693519257286094, + "loss": 2.4049, + "step": 6927 + }, + { + "epoch": 0.7944043114321752, + "grad_norm": 0.25758034533303387, + "learning_rate": 0.0001068204485518352, + "loss": 2.5314, + "step": 6928 + }, + { + "epoch": 0.7945189771815159, + "grad_norm": 0.25068599214894854, + "learning_rate": 0.00010670575876243388, + "loss": 2.4118, + "step": 6929 + }, + { + "epoch": 0.7946336429308566, + "grad_norm": 0.2806115194867441, + "learning_rate": 0.00010659112322047631, + "loss": 2.4053, + "step": 6930 + }, + { + "epoch": 0.7947483086801972, + "grad_norm": 0.2750254357405606, + "learning_rate": 0.00010647654194177436, + "loss": 2.3932, + "step": 6931 + }, + { + "epoch": 0.7948629744295379, + "grad_norm": 0.2470003239993299, + "learning_rate": 0.0001063620149421321, + "loss": 2.2538, + "step": 6932 + }, + { + "epoch": 0.7949776401788786, + "grad_norm": 0.2539465837010201, + "learning_rate": 0.00010624754223734628, + "loss": 2.4609, + "step": 6933 + }, + { + "epoch": 0.7950923059282192, + "grad_norm": 0.2656069059003555, + "learning_rate": 0.00010613312384320672, + "loss": 2.3554, + "step": 6934 + }, + { + "epoch": 0.7952069716775599, + "grad_norm": 0.26398845005673016, + "learning_rate": 0.00010601875977549474, + "loss": 2.4941, + "step": 6935 + }, + { + "epoch": 0.7953216374269005, + "grad_norm": 0.22946196700084354, + "learning_rate": 0.00010590445004998494, + "loss": 2.4425, + "step": 6936 + }, + { + "epoch": 0.7954363031762413, + "grad_norm": 0.25718615111915194, + "learning_rate": 0.00010579019468244421, + "loss": 2.3944, + "step": 6937 + }, + { + "epoch": 0.795550968925582, + "grad_norm": 0.2417548408696056, + "learning_rate": 0.00010567599368863156, + "loss": 2.4083, + "step": 6938 + }, + { + "epoch": 0.7956656346749226, + "grad_norm": 0.28213505526766525, + "learning_rate": 0.00010556184708429922, + "loss": 2.4168, + "step": 6939 + }, + { + "epoch": 0.7957803004242633, + "grad_norm": 0.2677138329051195, + "learning_rate": 0.00010544775488519154, + "loss": 2.5642, + "step": 6940 + }, + { + "epoch": 0.7958949661736039, + "grad_norm": 0.23018926770589765, + "learning_rate": 0.00010533371710704504, + "loss": 2.5062, + "step": 6941 + }, + { + "epoch": 0.7960096319229446, + "grad_norm": 0.27874228429895287, + "learning_rate": 0.00010521973376558925, + "loss": 2.3756, + "step": 6942 + }, + { + "epoch": 0.7961242976722853, + "grad_norm": 0.2658523993443978, + "learning_rate": 0.00010510580487654603, + "loss": 2.2162, + "step": 6943 + }, + { + "epoch": 0.7962389634216259, + "grad_norm": 0.24738403324609673, + "learning_rate": 0.00010499193045562966, + "loss": 2.4539, + "step": 6944 + }, + { + "epoch": 0.7963536291709666, + "grad_norm": 0.27451845904506583, + "learning_rate": 0.0001048781105185469, + "loss": 2.4648, + "step": 6945 + }, + { + "epoch": 0.7964682949203074, + "grad_norm": 0.2412453083949022, + "learning_rate": 0.00010476434508099725, + "loss": 2.5148, + "step": 6946 + }, + { + "epoch": 0.796582960669648, + "grad_norm": 0.24290417464598424, + "learning_rate": 0.00010465063415867216, + "loss": 2.2668, + "step": 6947 + }, + { + "epoch": 0.7966976264189887, + "grad_norm": 0.26647036110151556, + "learning_rate": 0.00010453697776725596, + "loss": 2.3113, + "step": 6948 + }, + { + "epoch": 0.7968122921683293, + "grad_norm": 0.2656498102169688, + "learning_rate": 0.00010442337592242567, + "loss": 2.3041, + "step": 6949 + }, + { + "epoch": 0.79692695791767, + "grad_norm": 0.25222819204654473, + "learning_rate": 0.00010430982863985023, + "loss": 2.3448, + "step": 6950 + }, + { + "epoch": 0.7970416236670107, + "grad_norm": 0.263812063873034, + "learning_rate": 0.00010419633593519134, + "loss": 2.3792, + "step": 6951 + }, + { + "epoch": 0.7971562894163513, + "grad_norm": 0.265180835628077, + "learning_rate": 0.00010408289782410335, + "loss": 2.4562, + "step": 6952 + }, + { + "epoch": 0.797270955165692, + "grad_norm": 0.24865198742248873, + "learning_rate": 0.00010396951432223245, + "loss": 2.3708, + "step": 6953 + }, + { + "epoch": 0.7973856209150327, + "grad_norm": 0.26402752258513074, + "learning_rate": 0.00010385618544521819, + "loss": 2.4901, + "step": 6954 + }, + { + "epoch": 0.7975002866643733, + "grad_norm": 0.2517870364257527, + "learning_rate": 0.00010374291120869206, + "loss": 2.3083, + "step": 6955 + }, + { + "epoch": 0.797614952413714, + "grad_norm": 0.2694991147854156, + "learning_rate": 0.00010362969162827785, + "loss": 2.2431, + "step": 6956 + }, + { + "epoch": 0.7977296181630547, + "grad_norm": 0.2298091318554434, + "learning_rate": 0.00010351652671959222, + "loss": 2.3173, + "step": 6957 + }, + { + "epoch": 0.7978442839123954, + "grad_norm": 0.26051970132182495, + "learning_rate": 0.00010340341649824403, + "loss": 2.3735, + "step": 6958 + }, + { + "epoch": 0.7979589496617361, + "grad_norm": 0.2778167878131525, + "learning_rate": 0.00010329036097983474, + "loss": 2.3545, + "step": 6959 + }, + { + "epoch": 0.7980736154110767, + "grad_norm": 0.2621648668343236, + "learning_rate": 0.00010317736017995816, + "loss": 2.266, + "step": 6960 + }, + { + "epoch": 0.7981882811604174, + "grad_norm": 0.26108324101635194, + "learning_rate": 0.00010306441411420075, + "loss": 2.391, + "step": 6961 + }, + { + "epoch": 0.798302946909758, + "grad_norm": 0.284550242432633, + "learning_rate": 0.00010295152279814096, + "loss": 2.4193, + "step": 6962 + }, + { + "epoch": 0.7984176126590987, + "grad_norm": 0.2685279586193713, + "learning_rate": 0.00010283868624735021, + "loss": 2.4443, + "step": 6963 + }, + { + "epoch": 0.7985322784084394, + "grad_norm": 0.2571220058511691, + "learning_rate": 0.0001027259044773921, + "loss": 2.2945, + "step": 6964 + }, + { + "epoch": 0.79864694415778, + "grad_norm": 0.27042574686473425, + "learning_rate": 0.00010261317750382276, + "loss": 2.3661, + "step": 6965 + }, + { + "epoch": 0.7987616099071208, + "grad_norm": 0.24170155808899835, + "learning_rate": 0.00010250050534219068, + "loss": 2.2534, + "step": 6966 + }, + { + "epoch": 0.7988762756564615, + "grad_norm": 0.2630985580919513, + "learning_rate": 0.000102387888008037, + "loss": 2.2916, + "step": 6967 + }, + { + "epoch": 0.7989909414058021, + "grad_norm": 0.2598798516149883, + "learning_rate": 0.00010227532551689472, + "loss": 2.3505, + "step": 6968 + }, + { + "epoch": 0.7991056071551428, + "grad_norm": 0.2659340699640982, + "learning_rate": 0.00010216281788429022, + "loss": 2.4232, + "step": 6969 + }, + { + "epoch": 0.7992202729044834, + "grad_norm": 0.26269622946639093, + "learning_rate": 0.0001020503651257414, + "loss": 2.4228, + "step": 6970 + }, + { + "epoch": 0.7993349386538241, + "grad_norm": 0.25905272759009235, + "learning_rate": 0.00010193796725675907, + "loss": 2.5631, + "step": 6971 + }, + { + "epoch": 0.7994496044031648, + "grad_norm": 0.24945300195445935, + "learning_rate": 0.00010182562429284647, + "loss": 2.4002, + "step": 6972 + }, + { + "epoch": 0.7995642701525054, + "grad_norm": 0.2684725179355554, + "learning_rate": 0.00010171333624949908, + "loss": 2.3093, + "step": 6973 + }, + { + "epoch": 0.7996789359018461, + "grad_norm": 0.25163134128389636, + "learning_rate": 0.00010160110314220489, + "loss": 2.2555, + "step": 6974 + }, + { + "epoch": 0.7997936016511867, + "grad_norm": 0.23819828850716698, + "learning_rate": 0.00010148892498644452, + "loss": 2.4777, + "step": 6975 + }, + { + "epoch": 0.7999082674005275, + "grad_norm": 0.28552381843621605, + "learning_rate": 0.00010137680179769048, + "loss": 2.3167, + "step": 6976 + }, + { + "epoch": 0.8000229331498682, + "grad_norm": 0.27070904619950015, + "learning_rate": 0.0001012647335914082, + "loss": 2.3954, + "step": 6977 + }, + { + "epoch": 0.8001375988992088, + "grad_norm": 0.2725708048755457, + "learning_rate": 0.00010115272038305529, + "loss": 2.3674, + "step": 6978 + }, + { + "epoch": 0.8002522646485495, + "grad_norm": 0.24615764486365646, + "learning_rate": 0.00010104076218808194, + "loss": 2.5392, + "step": 6979 + }, + { + "epoch": 0.8003669303978902, + "grad_norm": 0.2990770754159958, + "learning_rate": 0.00010092885902193055, + "loss": 2.3088, + "step": 6980 + }, + { + "epoch": 0.8004815961472308, + "grad_norm": 0.2526128135764045, + "learning_rate": 0.00010081701090003625, + "loss": 2.2668, + "step": 6981 + }, + { + "epoch": 0.8005962618965715, + "grad_norm": 0.26216620350526504, + "learning_rate": 0.00010070521783782599, + "loss": 2.2908, + "step": 6982 + }, + { + "epoch": 0.8007109276459121, + "grad_norm": 0.24547498160989933, + "learning_rate": 0.00010059347985071959, + "loss": 2.3308, + "step": 6983 + }, + { + "epoch": 0.8008255933952528, + "grad_norm": 0.26559221926058574, + "learning_rate": 0.00010048179695412951, + "loss": 2.35, + "step": 6984 + }, + { + "epoch": 0.8009402591445935, + "grad_norm": 0.2539138703304889, + "learning_rate": 0.00010037016916345987, + "loss": 2.4227, + "step": 6985 + }, + { + "epoch": 0.8010549248939342, + "grad_norm": 0.25726414016761157, + "learning_rate": 0.00010025859649410778, + "loss": 2.4401, + "step": 6986 + }, + { + "epoch": 0.8011695906432749, + "grad_norm": 0.2557555577590632, + "learning_rate": 0.0001001470789614627, + "loss": 2.4826, + "step": 6987 + }, + { + "epoch": 0.8012842563926156, + "grad_norm": 0.24985395161000665, + "learning_rate": 0.00010003561658090588, + "loss": 2.4515, + "step": 6988 + }, + { + "epoch": 0.8013989221419562, + "grad_norm": 0.2467958590704012, + "learning_rate": 9.992420936781188e-05, + "loss": 2.3719, + "step": 6989 + }, + { + "epoch": 0.8015135878912969, + "grad_norm": 0.2320999153479567, + "learning_rate": 9.981285733754719e-05, + "loss": 2.4159, + "step": 6990 + }, + { + "epoch": 0.8016282536406375, + "grad_norm": 0.25129107169356285, + "learning_rate": 9.970156050547047e-05, + "loss": 2.3715, + "step": 6991 + }, + { + "epoch": 0.8017429193899782, + "grad_norm": 0.28175688629122697, + "learning_rate": 9.959031888693304e-05, + "loss": 2.5136, + "step": 6992 + }, + { + "epoch": 0.8018575851393189, + "grad_norm": 0.27126125663573897, + "learning_rate": 9.947913249727864e-05, + "loss": 2.422, + "step": 6993 + }, + { + "epoch": 0.8019722508886595, + "grad_norm": 0.2575020097669227, + "learning_rate": 9.936800135184332e-05, + "loss": 2.4822, + "step": 6994 + }, + { + "epoch": 0.8020869166380002, + "grad_norm": 0.25548222948312577, + "learning_rate": 9.925692546595544e-05, + "loss": 2.3899, + "step": 6995 + }, + { + "epoch": 0.8022015823873409, + "grad_norm": 0.2720799436566901, + "learning_rate": 9.914590485493596e-05, + "loss": 2.3977, + "step": 6996 + }, + { + "epoch": 0.8023162481366816, + "grad_norm": 0.2671158416325561, + "learning_rate": 9.903493953409781e-05, + "loss": 2.2552, + "step": 6997 + }, + { + "epoch": 0.8024309138860223, + "grad_norm": 0.2771785017714363, + "learning_rate": 9.892402951874657e-05, + "loss": 2.3894, + "step": 6998 + }, + { + "epoch": 0.8025455796353629, + "grad_norm": 0.2785747470819906, + "learning_rate": 9.881317482418045e-05, + "loss": 2.5308, + "step": 6999 + }, + { + "epoch": 0.8026602453847036, + "grad_norm": 0.25888800760965536, + "learning_rate": 9.87023754656895e-05, + "loss": 2.4156, + "step": 7000 + }, + { + "epoch": 0.8027749111340443, + "grad_norm": 0.25068459974895557, + "learning_rate": 9.859163145855632e-05, + "loss": 2.3743, + "step": 7001 + }, + { + "epoch": 0.8028895768833849, + "grad_norm": 0.24856172773104443, + "learning_rate": 9.848094281805626e-05, + "loss": 2.2163, + "step": 7002 + }, + { + "epoch": 0.8030042426327256, + "grad_norm": 0.2661545202434637, + "learning_rate": 9.837030955945619e-05, + "loss": 2.4266, + "step": 7003 + }, + { + "epoch": 0.8031189083820662, + "grad_norm": 0.2619773901015697, + "learning_rate": 9.825973169801627e-05, + "loss": 2.3976, + "step": 7004 + }, + { + "epoch": 0.803233574131407, + "grad_norm": 0.26514591740103366, + "learning_rate": 9.814920924898869e-05, + "loss": 2.2835, + "step": 7005 + }, + { + "epoch": 0.8033482398807477, + "grad_norm": 0.2752083855936904, + "learning_rate": 9.80387422276175e-05, + "loss": 2.4224, + "step": 7006 + }, + { + "epoch": 0.8034629056300883, + "grad_norm": 0.2522019012537233, + "learning_rate": 9.792833064913981e-05, + "loss": 2.3522, + "step": 7007 + }, + { + "epoch": 0.803577571379429, + "grad_norm": 0.28428501299962844, + "learning_rate": 9.781797452878471e-05, + "loss": 2.5217, + "step": 7008 + }, + { + "epoch": 0.8036922371287696, + "grad_norm": 0.26404147779891035, + "learning_rate": 9.770767388177371e-05, + "loss": 2.4935, + "step": 7009 + }, + { + "epoch": 0.8038069028781103, + "grad_norm": 0.2581773548310228, + "learning_rate": 9.759742872332073e-05, + "loss": 2.5377, + "step": 7010 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 0.22350432203916074, + "learning_rate": 9.74872390686321e-05, + "loss": 2.3019, + "step": 7011 + }, + { + "epoch": 0.8040362343767916, + "grad_norm": 0.24545712652346155, + "learning_rate": 9.737710493290614e-05, + "loss": 2.34, + "step": 7012 + }, + { + "epoch": 0.8041509001261323, + "grad_norm": 0.29974318717026505, + "learning_rate": 9.726702633133383e-05, + "loss": 2.4811, + "step": 7013 + }, + { + "epoch": 0.804265565875473, + "grad_norm": 0.26622374012307676, + "learning_rate": 9.715700327909843e-05, + "loss": 2.3258, + "step": 7014 + }, + { + "epoch": 0.8043802316248136, + "grad_norm": 0.25365192080274335, + "learning_rate": 9.704703579137558e-05, + "loss": 2.2716, + "step": 7015 + }, + { + "epoch": 0.8044948973741544, + "grad_norm": 0.23543868496073328, + "learning_rate": 9.693712388333314e-05, + "loss": 2.2966, + "step": 7016 + }, + { + "epoch": 0.804609563123495, + "grad_norm": 0.30021137705531065, + "learning_rate": 9.682726757013155e-05, + "loss": 2.5061, + "step": 7017 + }, + { + "epoch": 0.8047242288728357, + "grad_norm": 0.26473142034752084, + "learning_rate": 9.671746686692296e-05, + "loss": 2.3841, + "step": 7018 + }, + { + "epoch": 0.8048388946221764, + "grad_norm": 0.26205358357994735, + "learning_rate": 9.660772178885275e-05, + "loss": 2.2289, + "step": 7019 + }, + { + "epoch": 0.804953560371517, + "grad_norm": 0.28138023223175274, + "learning_rate": 9.649803235105786e-05, + "loss": 2.4809, + "step": 7020 + }, + { + "epoch": 0.8050682261208577, + "grad_norm": 0.2908001264279954, + "learning_rate": 9.638839856866799e-05, + "loss": 2.3245, + "step": 7021 + }, + { + "epoch": 0.8051828918701984, + "grad_norm": 0.2748606317175616, + "learning_rate": 9.627882045680497e-05, + "loss": 2.4546, + "step": 7022 + }, + { + "epoch": 0.805297557619539, + "grad_norm": 0.27900928243017625, + "learning_rate": 9.616929803058305e-05, + "loss": 2.499, + "step": 7023 + }, + { + "epoch": 0.8054122233688797, + "grad_norm": 0.2560660648049793, + "learning_rate": 9.605983130510876e-05, + "loss": 2.4843, + "step": 7024 + }, + { + "epoch": 0.8055268891182203, + "grad_norm": 0.24602628591873082, + "learning_rate": 9.595042029548112e-05, + "loss": 2.44, + "step": 7025 + }, + { + "epoch": 0.8056415548675611, + "grad_norm": 0.24566556641200737, + "learning_rate": 9.584106501679091e-05, + "loss": 2.3937, + "step": 7026 + }, + { + "epoch": 0.8057562206169018, + "grad_norm": 0.2536659896352178, + "learning_rate": 9.573176548412182e-05, + "loss": 2.3585, + "step": 7027 + }, + { + "epoch": 0.8058708863662424, + "grad_norm": 0.24893256907622652, + "learning_rate": 9.562252171254965e-05, + "loss": 2.5434, + "step": 7028 + }, + { + "epoch": 0.8059855521155831, + "grad_norm": 0.2520849704166014, + "learning_rate": 9.55133337171425e-05, + "loss": 2.4621, + "step": 7029 + }, + { + "epoch": 0.8061002178649237, + "grad_norm": 0.26405524592418683, + "learning_rate": 9.540420151296069e-05, + "loss": 2.5345, + "step": 7030 + }, + { + "epoch": 0.8062148836142644, + "grad_norm": 0.24279935683284148, + "learning_rate": 9.529512511505716e-05, + "loss": 2.2723, + "step": 7031 + }, + { + "epoch": 0.8063295493636051, + "grad_norm": 0.2575212809317785, + "learning_rate": 9.51861045384766e-05, + "loss": 2.387, + "step": 7032 + }, + { + "epoch": 0.8064442151129457, + "grad_norm": 0.27138751271438716, + "learning_rate": 9.507713979825628e-05, + "loss": 2.4858, + "step": 7033 + }, + { + "epoch": 0.8065588808622864, + "grad_norm": 0.2794879446439531, + "learning_rate": 9.496823090942631e-05, + "loss": 2.2195, + "step": 7034 + }, + { + "epoch": 0.8066735466116272, + "grad_norm": 0.2613049952315897, + "learning_rate": 9.485937788700816e-05, + "loss": 2.3202, + "step": 7035 + }, + { + "epoch": 0.8067882123609678, + "grad_norm": 0.2709212811018067, + "learning_rate": 9.475058074601611e-05, + "loss": 2.402, + "step": 7036 + }, + { + "epoch": 0.8069028781103085, + "grad_norm": 0.23662644129606814, + "learning_rate": 9.464183950145683e-05, + "loss": 2.4382, + "step": 7037 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 0.2557539109207896, + "learning_rate": 9.45331541683287e-05, + "loss": 2.3822, + "step": 7038 + }, + { + "epoch": 0.8071322096089898, + "grad_norm": 0.27669445720539104, + "learning_rate": 9.442452476162322e-05, + "loss": 2.4056, + "step": 7039 + }, + { + "epoch": 0.8072468753583305, + "grad_norm": 0.2643439824210586, + "learning_rate": 9.431595129632364e-05, + "loss": 2.4038, + "step": 7040 + }, + { + "epoch": 0.8073615411076711, + "grad_norm": 0.2856784498370159, + "learning_rate": 9.420743378740542e-05, + "loss": 2.2515, + "step": 7041 + }, + { + "epoch": 0.8074762068570118, + "grad_norm": 0.23474042333803755, + "learning_rate": 9.409897224983666e-05, + "loss": 2.3596, + "step": 7042 + }, + { + "epoch": 0.8075908726063524, + "grad_norm": 0.24035333735611183, + "learning_rate": 9.399056669857747e-05, + "loss": 2.4408, + "step": 7043 + }, + { + "epoch": 0.8077055383556931, + "grad_norm": 0.27243032553725854, + "learning_rate": 9.38822171485803e-05, + "loss": 2.2979, + "step": 7044 + }, + { + "epoch": 0.8078202041050339, + "grad_norm": 0.250095781034164, + "learning_rate": 9.377392361479003e-05, + "loss": 2.3978, + "step": 7045 + }, + { + "epoch": 0.8079348698543745, + "grad_norm": 0.2743427116142907, + "learning_rate": 9.366568611214376e-05, + "loss": 2.41, + "step": 7046 + }, + { + "epoch": 0.8080495356037152, + "grad_norm": 0.2531163547179839, + "learning_rate": 9.35575046555705e-05, + "loss": 2.4264, + "step": 7047 + }, + { + "epoch": 0.8081642013530559, + "grad_norm": 0.24258897702375937, + "learning_rate": 9.344937925999187e-05, + "loss": 2.3214, + "step": 7048 + }, + { + "epoch": 0.8082788671023965, + "grad_norm": 0.26544358624606484, + "learning_rate": 9.334130994032202e-05, + "loss": 2.4451, + "step": 7049 + }, + { + "epoch": 0.8083935328517372, + "grad_norm": 0.2256731967244761, + "learning_rate": 9.323329671146674e-05, + "loss": 2.4189, + "step": 7050 + }, + { + "epoch": 0.8085081986010778, + "grad_norm": 0.26571066615449296, + "learning_rate": 9.312533958832453e-05, + "loss": 2.4761, + "step": 7051 + }, + { + "epoch": 0.8086228643504185, + "grad_norm": 0.26344904774416017, + "learning_rate": 9.301743858578609e-05, + "loss": 2.4168, + "step": 7052 + }, + { + "epoch": 0.8087375300997592, + "grad_norm": 0.27775565013374215, + "learning_rate": 9.290959371873391e-05, + "loss": 2.3632, + "step": 7053 + }, + { + "epoch": 0.8088521958490998, + "grad_norm": 0.25176749154829475, + "learning_rate": 9.28018050020436e-05, + "loss": 2.4131, + "step": 7054 + }, + { + "epoch": 0.8089668615984406, + "grad_norm": 0.28701635137919107, + "learning_rate": 9.269407245058254e-05, + "loss": 2.384, + "step": 7055 + }, + { + "epoch": 0.8090815273477813, + "grad_norm": 0.22856369421702732, + "learning_rate": 9.258639607921005e-05, + "loss": 2.3651, + "step": 7056 + }, + { + "epoch": 0.8091961930971219, + "grad_norm": 0.24426358781748547, + "learning_rate": 9.24787759027782e-05, + "loss": 2.3433, + "step": 7057 + }, + { + "epoch": 0.8093108588464626, + "grad_norm": 0.2511764342959095, + "learning_rate": 9.237121193613119e-05, + "loss": 2.3026, + "step": 7058 + }, + { + "epoch": 0.8094255245958032, + "grad_norm": 0.2654578168667341, + "learning_rate": 9.22637041941054e-05, + "loss": 2.4127, + "step": 7059 + }, + { + "epoch": 0.8095401903451439, + "grad_norm": 0.27094303590526236, + "learning_rate": 9.215625269152938e-05, + "loss": 2.3983, + "step": 7060 + }, + { + "epoch": 0.8096548560944846, + "grad_norm": 0.24355249859018951, + "learning_rate": 9.204885744322428e-05, + "loss": 2.3707, + "step": 7061 + }, + { + "epoch": 0.8097695218438252, + "grad_norm": 0.23892316931040286, + "learning_rate": 9.19415184640029e-05, + "loss": 2.2796, + "step": 7062 + }, + { + "epoch": 0.8098841875931659, + "grad_norm": 0.28657650179592775, + "learning_rate": 9.183423576867078e-05, + "loss": 2.4289, + "step": 7063 + }, + { + "epoch": 0.8099988533425065, + "grad_norm": 0.25075207637843466, + "learning_rate": 9.172700937202544e-05, + "loss": 2.4402, + "step": 7064 + }, + { + "epoch": 0.8101135190918473, + "grad_norm": 0.2673021913373499, + "learning_rate": 9.161983928885676e-05, + "loss": 2.4787, + "step": 7065 + }, + { + "epoch": 0.810228184841188, + "grad_norm": 0.2563132248206784, + "learning_rate": 9.151272553394685e-05, + "loss": 2.4824, + "step": 7066 + }, + { + "epoch": 0.8103428505905286, + "grad_norm": 0.2755890212139305, + "learning_rate": 9.140566812207008e-05, + "loss": 2.4895, + "step": 7067 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.2517563116213636, + "learning_rate": 9.129866706799262e-05, + "loss": 2.4264, + "step": 7068 + }, + { + "epoch": 0.81057218208921, + "grad_norm": 0.2421034258556156, + "learning_rate": 9.119172238647377e-05, + "loss": 2.291, + "step": 7069 + }, + { + "epoch": 0.8106868478385506, + "grad_norm": 0.23326836811781465, + "learning_rate": 9.108483409226403e-05, + "loss": 2.4193, + "step": 7070 + }, + { + "epoch": 0.8108015135878913, + "grad_norm": 0.24349759531050258, + "learning_rate": 9.097800220010689e-05, + "loss": 2.4006, + "step": 7071 + }, + { + "epoch": 0.8109161793372319, + "grad_norm": 0.25781496298250534, + "learning_rate": 9.087122672473758e-05, + "loss": 2.5394, + "step": 7072 + }, + { + "epoch": 0.8110308450865726, + "grad_norm": 0.25836956641708336, + "learning_rate": 9.07645076808839e-05, + "loss": 2.5001, + "step": 7073 + }, + { + "epoch": 0.8111455108359134, + "grad_norm": 0.25521665297489526, + "learning_rate": 9.065784508326564e-05, + "loss": 2.451, + "step": 7074 + }, + { + "epoch": 0.811260176585254, + "grad_norm": 0.24402218016355487, + "learning_rate": 9.055123894659501e-05, + "loss": 2.3042, + "step": 7075 + }, + { + "epoch": 0.8113748423345947, + "grad_norm": 0.23950617618985487, + "learning_rate": 9.044468928557603e-05, + "loss": 2.3582, + "step": 7076 + }, + { + "epoch": 0.8114895080839353, + "grad_norm": 0.25193677658962943, + "learning_rate": 9.03381961149053e-05, + "loss": 2.4313, + "step": 7077 + }, + { + "epoch": 0.811604173833276, + "grad_norm": 0.2539978155655879, + "learning_rate": 9.023175944927159e-05, + "loss": 2.2715, + "step": 7078 + }, + { + "epoch": 0.8117188395826167, + "grad_norm": 0.24300363592535032, + "learning_rate": 9.012537930335574e-05, + "loss": 2.3649, + "step": 7079 + }, + { + "epoch": 0.8118335053319573, + "grad_norm": 0.285688224511098, + "learning_rate": 9.001905569183089e-05, + "loss": 2.4115, + "step": 7080 + }, + { + "epoch": 0.811948171081298, + "grad_norm": 0.2931415766894181, + "learning_rate": 8.991278862936253e-05, + "loss": 2.4031, + "step": 7081 + }, + { + "epoch": 0.8120628368306387, + "grad_norm": 0.28251524389170907, + "learning_rate": 8.980657813060783e-05, + "loss": 2.3514, + "step": 7082 + }, + { + "epoch": 0.8121775025799793, + "grad_norm": 0.2520673457728205, + "learning_rate": 8.970042421021662e-05, + "loss": 2.2877, + "step": 7083 + }, + { + "epoch": 0.81229216832932, + "grad_norm": 0.26190673540371573, + "learning_rate": 8.959432688283103e-05, + "loss": 2.3681, + "step": 7084 + }, + { + "epoch": 0.8124068340786607, + "grad_norm": 0.2341330233565217, + "learning_rate": 8.948828616308496e-05, + "loss": 2.3811, + "step": 7085 + }, + { + "epoch": 0.8125214998280014, + "grad_norm": 0.2674752073294024, + "learning_rate": 8.938230206560471e-05, + "loss": 2.2678, + "step": 7086 + }, + { + "epoch": 0.8126361655773421, + "grad_norm": 0.2764465992327549, + "learning_rate": 8.927637460500892e-05, + "loss": 2.5112, + "step": 7087 + }, + { + "epoch": 0.8127508313266827, + "grad_norm": 0.25870195241285737, + "learning_rate": 8.91705037959079e-05, + "loss": 2.4008, + "step": 7088 + }, + { + "epoch": 0.8128654970760234, + "grad_norm": 0.2723688475861717, + "learning_rate": 8.906468965290493e-05, + "loss": 2.3863, + "step": 7089 + }, + { + "epoch": 0.8129801628253641, + "grad_norm": 0.25157579990487255, + "learning_rate": 8.895893219059498e-05, + "loss": 2.4284, + "step": 7090 + }, + { + "epoch": 0.8130948285747047, + "grad_norm": 0.2553041018828866, + "learning_rate": 8.885323142356506e-05, + "loss": 2.3712, + "step": 7091 + }, + { + "epoch": 0.8132094943240454, + "grad_norm": 0.24759221782061655, + "learning_rate": 8.874758736639465e-05, + "loss": 2.3768, + "step": 7092 + }, + { + "epoch": 0.813324160073386, + "grad_norm": 0.23571078160196235, + "learning_rate": 8.864200003365541e-05, + "loss": 2.395, + "step": 7093 + }, + { + "epoch": 0.8134388258227268, + "grad_norm": 0.25477292438650545, + "learning_rate": 8.853646943991106e-05, + "loss": 2.445, + "step": 7094 + }, + { + "epoch": 0.8135534915720675, + "grad_norm": 0.2521028083552525, + "learning_rate": 8.843099559971757e-05, + "loss": 2.4443, + "step": 7095 + }, + { + "epoch": 0.8136681573214081, + "grad_norm": 0.2349413720162316, + "learning_rate": 8.832557852762312e-05, + "loss": 2.4378, + "step": 7096 + }, + { + "epoch": 0.8137828230707488, + "grad_norm": 0.26959163983617296, + "learning_rate": 8.822021823816778e-05, + "loss": 2.5226, + "step": 7097 + }, + { + "epoch": 0.8138974888200894, + "grad_norm": 0.28448664790777556, + "learning_rate": 8.811491474588396e-05, + "loss": 2.4321, + "step": 7098 + }, + { + "epoch": 0.8140121545694301, + "grad_norm": 0.240574969584723, + "learning_rate": 8.800966806529665e-05, + "loss": 2.431, + "step": 7099 + }, + { + "epoch": 0.8141268203187708, + "grad_norm": 0.2609193108109736, + "learning_rate": 8.790447821092224e-05, + "loss": 2.4555, + "step": 7100 + }, + { + "epoch": 0.8142414860681114, + "grad_norm": 0.277569571970675, + "learning_rate": 8.779934519726985e-05, + "loss": 2.3929, + "step": 7101 + }, + { + "epoch": 0.8143561518174521, + "grad_norm": 0.2571115232081349, + "learning_rate": 8.76942690388406e-05, + "loss": 2.2698, + "step": 7102 + }, + { + "epoch": 0.8144708175667928, + "grad_norm": 0.2506312212981786, + "learning_rate": 8.758924975012744e-05, + "loss": 2.4831, + "step": 7103 + }, + { + "epoch": 0.8145854833161335, + "grad_norm": 0.25634071246485335, + "learning_rate": 8.748428734561614e-05, + "loss": 2.3682, + "step": 7104 + }, + { + "epoch": 0.8147001490654742, + "grad_norm": 0.28357040304383274, + "learning_rate": 8.737938183978428e-05, + "loss": 2.4487, + "step": 7105 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.2552235608366805, + "learning_rate": 8.727453324710133e-05, + "loss": 2.2901, + "step": 7106 + }, + { + "epoch": 0.8149294805641555, + "grad_norm": 0.25067560930010707, + "learning_rate": 8.716974158202923e-05, + "loss": 2.3063, + "step": 7107 + }, + { + "epoch": 0.8150441463134962, + "grad_norm": 0.2594420757705762, + "learning_rate": 8.706500685902197e-05, + "loss": 2.3089, + "step": 7108 + }, + { + "epoch": 0.8151588120628368, + "grad_norm": 0.23952720973807762, + "learning_rate": 8.696032909252582e-05, + "loss": 2.2999, + "step": 7109 + }, + { + "epoch": 0.8152734778121775, + "grad_norm": 0.2906314094419773, + "learning_rate": 8.685570829697898e-05, + "loss": 2.3311, + "step": 7110 + }, + { + "epoch": 0.8153881435615181, + "grad_norm": 0.25751071352592, + "learning_rate": 8.675114448681209e-05, + "loss": 2.362, + "step": 7111 + }, + { + "epoch": 0.8155028093108588, + "grad_norm": 0.2744821924998915, + "learning_rate": 8.664663767644742e-05, + "loss": 2.4209, + "step": 7112 + }, + { + "epoch": 0.8156174750601995, + "grad_norm": 0.25013760260730145, + "learning_rate": 8.654218788029989e-05, + "loss": 2.476, + "step": 7113 + }, + { + "epoch": 0.8157321408095402, + "grad_norm": 0.2787722648642111, + "learning_rate": 8.643779511277628e-05, + "loss": 2.3429, + "step": 7114 + }, + { + "epoch": 0.8158468065588809, + "grad_norm": 0.26471771747976514, + "learning_rate": 8.633345938827563e-05, + "loss": 2.3694, + "step": 7115 + }, + { + "epoch": 0.8159614723082216, + "grad_norm": 0.2246152208849204, + "learning_rate": 8.622918072118908e-05, + "loss": 2.3453, + "step": 7116 + }, + { + "epoch": 0.8160761380575622, + "grad_norm": 0.26320255971681095, + "learning_rate": 8.612495912589996e-05, + "loss": 2.5282, + "step": 7117 + }, + { + "epoch": 0.8161908038069029, + "grad_norm": 0.2498666067842061, + "learning_rate": 8.602079461678331e-05, + "loss": 2.2792, + "step": 7118 + }, + { + "epoch": 0.8163054695562435, + "grad_norm": 0.2728040852870363, + "learning_rate": 8.591668720820716e-05, + "loss": 2.446, + "step": 7119 + }, + { + "epoch": 0.8164201353055842, + "grad_norm": 0.2864231242241152, + "learning_rate": 8.581263691453073e-05, + "loss": 2.4344, + "step": 7120 + }, + { + "epoch": 0.8165348010549249, + "grad_norm": 0.26482610475196167, + "learning_rate": 8.57086437501059e-05, + "loss": 2.4839, + "step": 7121 + }, + { + "epoch": 0.8166494668042655, + "grad_norm": 0.28031591614485957, + "learning_rate": 8.560470772927653e-05, + "loss": 2.4847, + "step": 7122 + }, + { + "epoch": 0.8167641325536062, + "grad_norm": 0.23398675095287777, + "learning_rate": 8.550082886637861e-05, + "loss": 2.413, + "step": 7123 + }, + { + "epoch": 0.816878798302947, + "grad_norm": 0.2740168981089584, + "learning_rate": 8.539700717574034e-05, + "loss": 2.4516, + "step": 7124 + }, + { + "epoch": 0.8169934640522876, + "grad_norm": 0.243484692303628, + "learning_rate": 8.529324267168198e-05, + "loss": 2.3453, + "step": 7125 + }, + { + "epoch": 0.8171081298016283, + "grad_norm": 0.25010723336214113, + "learning_rate": 8.518953536851565e-05, + "loss": 2.3364, + "step": 7126 + }, + { + "epoch": 0.8172227955509689, + "grad_norm": 0.26525581037187146, + "learning_rate": 8.50858852805459e-05, + "loss": 2.5474, + "step": 7127 + }, + { + "epoch": 0.8173374613003096, + "grad_norm": 0.26629206328340216, + "learning_rate": 8.498229242206929e-05, + "loss": 2.3893, + "step": 7128 + }, + { + "epoch": 0.8174521270496503, + "grad_norm": 0.2896460817829774, + "learning_rate": 8.487875680737444e-05, + "loss": 2.3058, + "step": 7129 + }, + { + "epoch": 0.8175667927989909, + "grad_norm": 0.2486763486324493, + "learning_rate": 8.477527845074223e-05, + "loss": 2.2317, + "step": 7130 + }, + { + "epoch": 0.8176814585483316, + "grad_norm": 0.2838162039631978, + "learning_rate": 8.467185736644556e-05, + "loss": 2.3573, + "step": 7131 + }, + { + "epoch": 0.8177961242976722, + "grad_norm": 0.26982437009572635, + "learning_rate": 8.456849356874912e-05, + "loss": 2.3196, + "step": 7132 + }, + { + "epoch": 0.817910790047013, + "grad_norm": 0.22435517710224578, + "learning_rate": 8.446518707191009e-05, + "loss": 2.3347, + "step": 7133 + }, + { + "epoch": 0.8180254557963537, + "grad_norm": 0.2672491983132503, + "learning_rate": 8.436193789017788e-05, + "loss": 2.3318, + "step": 7134 + }, + { + "epoch": 0.8181401215456943, + "grad_norm": 0.2667778275181249, + "learning_rate": 8.425874603779349e-05, + "loss": 2.4536, + "step": 7135 + }, + { + "epoch": 0.818254787295035, + "grad_norm": 0.23423843421644505, + "learning_rate": 8.415561152899026e-05, + "loss": 2.4576, + "step": 7136 + }, + { + "epoch": 0.8183694530443757, + "grad_norm": 0.24676450697557595, + "learning_rate": 8.405253437799387e-05, + "loss": 2.3365, + "step": 7137 + }, + { + "epoch": 0.8184841187937163, + "grad_norm": 0.239433093917491, + "learning_rate": 8.394951459902145e-05, + "loss": 2.4747, + "step": 7138 + }, + { + "epoch": 0.818598784543057, + "grad_norm": 0.25456402768893965, + "learning_rate": 8.384655220628296e-05, + "loss": 2.481, + "step": 7139 + }, + { + "epoch": 0.8187134502923976, + "grad_norm": 0.25384640230990885, + "learning_rate": 8.374364721398015e-05, + "loss": 2.3056, + "step": 7140 + }, + { + "epoch": 0.8188281160417383, + "grad_norm": 0.29496457373132967, + "learning_rate": 8.364079963630644e-05, + "loss": 2.3623, + "step": 7141 + }, + { + "epoch": 0.818942781791079, + "grad_norm": 0.25549648465115943, + "learning_rate": 8.353800948744794e-05, + "loss": 2.327, + "step": 7142 + }, + { + "epoch": 0.8190574475404196, + "grad_norm": 0.2665849365789555, + "learning_rate": 8.343527678158252e-05, + "loss": 2.4726, + "step": 7143 + }, + { + "epoch": 0.8191721132897604, + "grad_norm": 0.2401832604274856, + "learning_rate": 8.333260153288025e-05, + "loss": 2.4543, + "step": 7144 + }, + { + "epoch": 0.819286779039101, + "grad_norm": 0.2505407988253365, + "learning_rate": 8.322998375550316e-05, + "loss": 2.3091, + "step": 7145 + }, + { + "epoch": 0.8194014447884417, + "grad_norm": 0.24556674004304613, + "learning_rate": 8.312742346360558e-05, + "loss": 2.383, + "step": 7146 + }, + { + "epoch": 0.8195161105377824, + "grad_norm": 0.2654076651511823, + "learning_rate": 8.302492067133349e-05, + "loss": 2.5243, + "step": 7147 + }, + { + "epoch": 0.819630776287123, + "grad_norm": 0.2586343692827218, + "learning_rate": 8.292247539282522e-05, + "loss": 2.5485, + "step": 7148 + }, + { + "epoch": 0.8197454420364637, + "grad_norm": 0.2492835164469896, + "learning_rate": 8.282008764221149e-05, + "loss": 2.4998, + "step": 7149 + }, + { + "epoch": 0.8198601077858044, + "grad_norm": 0.26311181205565626, + "learning_rate": 8.271775743361443e-05, + "loss": 2.3439, + "step": 7150 + }, + { + "epoch": 0.819974773535145, + "grad_norm": 0.27589743830621305, + "learning_rate": 8.261548478114855e-05, + "loss": 2.4721, + "step": 7151 + }, + { + "epoch": 0.8200894392844857, + "grad_norm": 0.233550982589699, + "learning_rate": 8.251326969892059e-05, + "loss": 2.2327, + "step": 7152 + }, + { + "epoch": 0.8202041050338263, + "grad_norm": 0.24329465127089503, + "learning_rate": 8.241111220102882e-05, + "loss": 2.4646, + "step": 7153 + }, + { + "epoch": 0.8203187707831671, + "grad_norm": 0.26338360481784595, + "learning_rate": 8.23090123015643e-05, + "loss": 2.3021, + "step": 7154 + }, + { + "epoch": 0.8204334365325078, + "grad_norm": 0.2566252070336107, + "learning_rate": 8.22069700146097e-05, + "loss": 2.3193, + "step": 7155 + }, + { + "epoch": 0.8205481022818484, + "grad_norm": 0.270564754020459, + "learning_rate": 8.210498535423961e-05, + "loss": 2.4127, + "step": 7156 + }, + { + "epoch": 0.8206627680311891, + "grad_norm": 0.25799502038068656, + "learning_rate": 8.200305833452099e-05, + "loss": 2.5109, + "step": 7157 + }, + { + "epoch": 0.8207774337805298, + "grad_norm": 0.27454837905225354, + "learning_rate": 8.190118896951271e-05, + "loss": 2.3174, + "step": 7158 + }, + { + "epoch": 0.8208920995298704, + "grad_norm": 0.25435873418816374, + "learning_rate": 8.179937727326575e-05, + "loss": 2.4281, + "step": 7159 + }, + { + "epoch": 0.8210067652792111, + "grad_norm": 0.2646981899494223, + "learning_rate": 8.16976232598231e-05, + "loss": 2.4499, + "step": 7160 + }, + { + "epoch": 0.8211214310285517, + "grad_norm": 0.2539208233098313, + "learning_rate": 8.159592694321982e-05, + "loss": 2.2882, + "step": 7161 + }, + { + "epoch": 0.8212360967778924, + "grad_norm": 0.2690461692585868, + "learning_rate": 8.149428833748279e-05, + "loss": 2.5136, + "step": 7162 + }, + { + "epoch": 0.8213507625272332, + "grad_norm": 0.22786345418460338, + "learning_rate": 8.13927074566313e-05, + "loss": 2.4348, + "step": 7163 + }, + { + "epoch": 0.8214654282765738, + "grad_norm": 0.22456851914412992, + "learning_rate": 8.129118431467636e-05, + "loss": 2.436, + "step": 7164 + }, + { + "epoch": 0.8215800940259145, + "grad_norm": 0.26636807528037715, + "learning_rate": 8.118971892562128e-05, + "loss": 2.332, + "step": 7165 + }, + { + "epoch": 0.8216947597752551, + "grad_norm": 0.24944320363236863, + "learning_rate": 8.108831130346117e-05, + "loss": 2.3177, + "step": 7166 + }, + { + "epoch": 0.8218094255245958, + "grad_norm": 0.2542661002335648, + "learning_rate": 8.098696146218354e-05, + "loss": 2.4709, + "step": 7167 + }, + { + "epoch": 0.8219240912739365, + "grad_norm": 0.2625456967084732, + "learning_rate": 8.088566941576714e-05, + "loss": 2.3651, + "step": 7168 + }, + { + "epoch": 0.8220387570232771, + "grad_norm": 0.2561120172775171, + "learning_rate": 8.078443517818384e-05, + "loss": 2.407, + "step": 7169 + }, + { + "epoch": 0.8221534227726178, + "grad_norm": 0.25807589879116727, + "learning_rate": 8.068325876339666e-05, + "loss": 2.4868, + "step": 7170 + }, + { + "epoch": 0.8222680885219585, + "grad_norm": 0.24191543879822863, + "learning_rate": 8.058214018536104e-05, + "loss": 2.4519, + "step": 7171 + }, + { + "epoch": 0.8223827542712991, + "grad_norm": 0.23565348805849098, + "learning_rate": 8.04810794580243e-05, + "loss": 2.27, + "step": 7172 + }, + { + "epoch": 0.8224974200206399, + "grad_norm": 0.23200899431052055, + "learning_rate": 8.038007659532587e-05, + "loss": 2.5146, + "step": 7173 + }, + { + "epoch": 0.8226120857699805, + "grad_norm": 0.2501445489851966, + "learning_rate": 8.027913161119726e-05, + "loss": 2.566, + "step": 7174 + }, + { + "epoch": 0.8227267515193212, + "grad_norm": 0.2535352682465857, + "learning_rate": 8.017824451956191e-05, + "loss": 2.3483, + "step": 7175 + }, + { + "epoch": 0.8228414172686619, + "grad_norm": 0.2399969547449492, + "learning_rate": 8.00774153343351e-05, + "loss": 2.403, + "step": 7176 + }, + { + "epoch": 0.8229560830180025, + "grad_norm": 0.2718698146732127, + "learning_rate": 7.99766440694244e-05, + "loss": 2.2132, + "step": 7177 + }, + { + "epoch": 0.8230707487673432, + "grad_norm": 0.24650301176187286, + "learning_rate": 7.987593073872923e-05, + "loss": 2.4001, + "step": 7178 + }, + { + "epoch": 0.8231854145166838, + "grad_norm": 0.2683963185710929, + "learning_rate": 7.977527535614116e-05, + "loss": 2.2794, + "step": 7179 + }, + { + "epoch": 0.8233000802660245, + "grad_norm": 0.2708995664577427, + "learning_rate": 7.967467793554361e-05, + "loss": 2.3415, + "step": 7180 + }, + { + "epoch": 0.8234147460153652, + "grad_norm": 0.2515790197110842, + "learning_rate": 7.95741384908123e-05, + "loss": 2.498, + "step": 7181 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.24202869051115636, + "learning_rate": 7.947365703581432e-05, + "loss": 2.4365, + "step": 7182 + }, + { + "epoch": 0.8236440775140466, + "grad_norm": 0.24264061029127898, + "learning_rate": 7.937323358440934e-05, + "loss": 2.3535, + "step": 7183 + }, + { + "epoch": 0.8237587432633873, + "grad_norm": 0.25760007293484516, + "learning_rate": 7.927286815044915e-05, + "loss": 2.24, + "step": 7184 + }, + { + "epoch": 0.8238734090127279, + "grad_norm": 0.2594058159413161, + "learning_rate": 7.91725607477769e-05, + "loss": 2.4142, + "step": 7185 + }, + { + "epoch": 0.8239880747620686, + "grad_norm": 0.24630730261397657, + "learning_rate": 7.907231139022814e-05, + "loss": 2.4586, + "step": 7186 + }, + { + "epoch": 0.8241027405114092, + "grad_norm": 0.2760166702430448, + "learning_rate": 7.897212009163057e-05, + "loss": 2.4808, + "step": 7187 + }, + { + "epoch": 0.8242174062607499, + "grad_norm": 0.2465036927951378, + "learning_rate": 7.887198686580327e-05, + "loss": 2.4958, + "step": 7188 + }, + { + "epoch": 0.8243320720100906, + "grad_norm": 0.2658216266709044, + "learning_rate": 7.877191172655807e-05, + "loss": 2.4915, + "step": 7189 + }, + { + "epoch": 0.8244467377594312, + "grad_norm": 0.2433147344105137, + "learning_rate": 7.867189468769842e-05, + "loss": 2.3368, + "step": 7190 + }, + { + "epoch": 0.8245614035087719, + "grad_norm": 0.2652262281953106, + "learning_rate": 7.857193576301953e-05, + "loss": 2.5328, + "step": 7191 + }, + { + "epoch": 0.8246760692581127, + "grad_norm": 0.2362629446715802, + "learning_rate": 7.847203496630895e-05, + "loss": 2.4088, + "step": 7192 + }, + { + "epoch": 0.8247907350074533, + "grad_norm": 0.2540998828739804, + "learning_rate": 7.837219231134613e-05, + "loss": 2.2888, + "step": 7193 + }, + { + "epoch": 0.824905400756794, + "grad_norm": 0.2723361365359184, + "learning_rate": 7.827240781190237e-05, + "loss": 2.5134, + "step": 7194 + }, + { + "epoch": 0.8250200665061346, + "grad_norm": 0.28267377765310747, + "learning_rate": 7.817268148174111e-05, + "loss": 2.5182, + "step": 7195 + }, + { + "epoch": 0.8251347322554753, + "grad_norm": 0.24393442745004826, + "learning_rate": 7.80730133346178e-05, + "loss": 2.2889, + "step": 7196 + }, + { + "epoch": 0.825249398004816, + "grad_norm": 0.2347990545299825, + "learning_rate": 7.797340338427955e-05, + "loss": 2.339, + "step": 7197 + }, + { + "epoch": 0.8253640637541566, + "grad_norm": 0.2760651031431398, + "learning_rate": 7.787385164446559e-05, + "loss": 2.3882, + "step": 7198 + }, + { + "epoch": 0.8254787295034973, + "grad_norm": 0.24786390848628995, + "learning_rate": 7.77743581289076e-05, + "loss": 2.2813, + "step": 7199 + }, + { + "epoch": 0.8255933952528379, + "grad_norm": 0.25148135423263224, + "learning_rate": 7.767492285132844e-05, + "loss": 2.4449, + "step": 7200 + }, + { + "epoch": 0.8257080610021786, + "grad_norm": 0.24763676702980145, + "learning_rate": 7.757554582544341e-05, + "loss": 2.3728, + "step": 7201 + }, + { + "epoch": 0.8258227267515194, + "grad_norm": 0.24976633216874178, + "learning_rate": 7.747622706495983e-05, + "loss": 2.4171, + "step": 7202 + }, + { + "epoch": 0.82593739250086, + "grad_norm": 0.2392046371267394, + "learning_rate": 7.73769665835764e-05, + "loss": 2.3823, + "step": 7203 + }, + { + "epoch": 0.8260520582502007, + "grad_norm": 0.28685577705728543, + "learning_rate": 7.727776439498463e-05, + "loss": 2.38, + "step": 7204 + }, + { + "epoch": 0.8261667239995414, + "grad_norm": 0.2513136031169034, + "learning_rate": 7.717862051286761e-05, + "loss": 2.3509, + "step": 7205 + }, + { + "epoch": 0.826281389748882, + "grad_norm": 0.26066376689976095, + "learning_rate": 7.707953495090004e-05, + "loss": 2.4258, + "step": 7206 + }, + { + "epoch": 0.8263960554982227, + "grad_norm": 0.2436961269898676, + "learning_rate": 7.6980507722749e-05, + "loss": 2.4257, + "step": 7207 + }, + { + "epoch": 0.8265107212475633, + "grad_norm": 0.24391059271418528, + "learning_rate": 7.68815388420735e-05, + "loss": 2.3175, + "step": 7208 + }, + { + "epoch": 0.826625386996904, + "grad_norm": 0.2712509630415017, + "learning_rate": 7.678262832252431e-05, + "loss": 2.3714, + "step": 7209 + }, + { + "epoch": 0.8267400527462447, + "grad_norm": 0.2780572645754265, + "learning_rate": 7.668377617774424e-05, + "loss": 2.3674, + "step": 7210 + }, + { + "epoch": 0.8268547184955853, + "grad_norm": 0.3202583144289909, + "learning_rate": 7.658498242136824e-05, + "loss": 2.3247, + "step": 7211 + }, + { + "epoch": 0.826969384244926, + "grad_norm": 0.26791707902707, + "learning_rate": 7.648624706702285e-05, + "loss": 2.3877, + "step": 7212 + }, + { + "epoch": 0.8270840499942668, + "grad_norm": 0.28624905373613635, + "learning_rate": 7.63875701283267e-05, + "loss": 2.4451, + "step": 7213 + }, + { + "epoch": 0.8271987157436074, + "grad_norm": 0.2656631069633248, + "learning_rate": 7.628895161889048e-05, + "loss": 2.4496, + "step": 7214 + }, + { + "epoch": 0.8273133814929481, + "grad_norm": 0.2500916687785805, + "learning_rate": 7.619039155231672e-05, + "loss": 2.4095, + "step": 7215 + }, + { + "epoch": 0.8274280472422887, + "grad_norm": 0.2588078405510094, + "learning_rate": 7.60918899421999e-05, + "loss": 2.4207, + "step": 7216 + }, + { + "epoch": 0.8275427129916294, + "grad_norm": 0.2829804167681859, + "learning_rate": 7.599344680212655e-05, + "loss": 2.3761, + "step": 7217 + }, + { + "epoch": 0.8276573787409701, + "grad_norm": 0.2714230677983044, + "learning_rate": 7.589506214567465e-05, + "loss": 2.4679, + "step": 7218 + }, + { + "epoch": 0.8277720444903107, + "grad_norm": 0.2716022097493529, + "learning_rate": 7.5796735986415e-05, + "loss": 2.3279, + "step": 7219 + }, + { + "epoch": 0.8278867102396514, + "grad_norm": 0.2772807912380061, + "learning_rate": 7.56984683379095e-05, + "loss": 2.4198, + "step": 7220 + }, + { + "epoch": 0.828001375988992, + "grad_norm": 0.24843591433734002, + "learning_rate": 7.560025921371232e-05, + "loss": 2.4135, + "step": 7221 + }, + { + "epoch": 0.8281160417383328, + "grad_norm": 0.2513581266651729, + "learning_rate": 7.550210862736961e-05, + "loss": 2.3976, + "step": 7222 + }, + { + "epoch": 0.8282307074876735, + "grad_norm": 0.24577162491321267, + "learning_rate": 7.540401659241936e-05, + "loss": 2.4007, + "step": 7223 + }, + { + "epoch": 0.8283453732370141, + "grad_norm": 0.27997173488984356, + "learning_rate": 7.530598312239145e-05, + "loss": 2.383, + "step": 7224 + }, + { + "epoch": 0.8284600389863548, + "grad_norm": 0.24539548333258915, + "learning_rate": 7.520800823080792e-05, + "loss": 2.488, + "step": 7225 + }, + { + "epoch": 0.8285747047356955, + "grad_norm": 0.26838255800805433, + "learning_rate": 7.511009193118223e-05, + "loss": 2.4194, + "step": 7226 + }, + { + "epoch": 0.8286893704850361, + "grad_norm": 0.2634384952414978, + "learning_rate": 7.501223423702025e-05, + "loss": 2.4499, + "step": 7227 + }, + { + "epoch": 0.8288040362343768, + "grad_norm": 0.2396692024773577, + "learning_rate": 7.491443516181951e-05, + "loss": 2.3293, + "step": 7228 + }, + { + "epoch": 0.8289187019837174, + "grad_norm": 0.2662685080998943, + "learning_rate": 7.48166947190696e-05, + "loss": 2.47, + "step": 7229 + }, + { + "epoch": 0.8290333677330581, + "grad_norm": 0.2449644219170579, + "learning_rate": 7.471901292225197e-05, + "loss": 2.4226, + "step": 7230 + }, + { + "epoch": 0.8291480334823988, + "grad_norm": 0.26497573512014916, + "learning_rate": 7.462138978483996e-05, + "loss": 2.4894, + "step": 7231 + }, + { + "epoch": 0.8292626992317395, + "grad_norm": 0.26858564074668034, + "learning_rate": 7.452382532029867e-05, + "loss": 2.2864, + "step": 7232 + }, + { + "epoch": 0.8293773649810802, + "grad_norm": 0.278777023164981, + "learning_rate": 7.442631954208529e-05, + "loss": 2.48, + "step": 7233 + }, + { + "epoch": 0.8294920307304208, + "grad_norm": 0.28443270161861384, + "learning_rate": 7.432887246364911e-05, + "loss": 2.3328, + "step": 7234 + }, + { + "epoch": 0.8296066964797615, + "grad_norm": 0.28633282641630514, + "learning_rate": 7.423148409843089e-05, + "loss": 2.3754, + "step": 7235 + }, + { + "epoch": 0.8297213622291022, + "grad_norm": 0.2502414400694604, + "learning_rate": 7.413415445986355e-05, + "loss": 2.3553, + "step": 7236 + }, + { + "epoch": 0.8298360279784428, + "grad_norm": 0.2743945045142924, + "learning_rate": 7.403688356137194e-05, + "loss": 2.4973, + "step": 7237 + }, + { + "epoch": 0.8299506937277835, + "grad_norm": 0.26592234998722886, + "learning_rate": 7.393967141637248e-05, + "loss": 2.4752, + "step": 7238 + }, + { + "epoch": 0.8300653594771242, + "grad_norm": 0.2564211960408765, + "learning_rate": 7.384251803827397e-05, + "loss": 2.5098, + "step": 7239 + }, + { + "epoch": 0.8301800252264648, + "grad_norm": 0.26588368484504593, + "learning_rate": 7.374542344047692e-05, + "loss": 2.3298, + "step": 7240 + }, + { + "epoch": 0.8302946909758055, + "grad_norm": 0.2650345696005376, + "learning_rate": 7.364838763637344e-05, + "loss": 2.2663, + "step": 7241 + }, + { + "epoch": 0.8304093567251462, + "grad_norm": 0.24417553992010202, + "learning_rate": 7.35514106393479e-05, + "loss": 2.3542, + "step": 7242 + }, + { + "epoch": 0.8305240224744869, + "grad_norm": 0.23503875689848353, + "learning_rate": 7.345449246277636e-05, + "loss": 2.4518, + "step": 7243 + }, + { + "epoch": 0.8306386882238276, + "grad_norm": 0.2696371879271095, + "learning_rate": 7.335763312002691e-05, + "loss": 2.3158, + "step": 7244 + }, + { + "epoch": 0.8307533539731682, + "grad_norm": 0.2508278778191043, + "learning_rate": 7.326083262445942e-05, + "loss": 2.4071, + "step": 7245 + }, + { + "epoch": 0.8308680197225089, + "grad_norm": 0.25406213633540037, + "learning_rate": 7.316409098942584e-05, + "loss": 2.3312, + "step": 7246 + }, + { + "epoch": 0.8309826854718496, + "grad_norm": 0.24449508957366375, + "learning_rate": 7.306740822826957e-05, + "loss": 2.291, + "step": 7247 + }, + { + "epoch": 0.8310973512211902, + "grad_norm": 0.27581977431894283, + "learning_rate": 7.297078435432608e-05, + "loss": 2.5688, + "step": 7248 + }, + { + "epoch": 0.8312120169705309, + "grad_norm": 0.26075882817435575, + "learning_rate": 7.287421938092325e-05, + "loss": 2.428, + "step": 7249 + }, + { + "epoch": 0.8313266827198715, + "grad_norm": 0.24713561039219137, + "learning_rate": 7.277771332137995e-05, + "loss": 2.3161, + "step": 7250 + }, + { + "epoch": 0.8314413484692122, + "grad_norm": 0.28519544977886985, + "learning_rate": 7.26812661890075e-05, + "loss": 2.4297, + "step": 7251 + }, + { + "epoch": 0.831556014218553, + "grad_norm": 0.2616099370264878, + "learning_rate": 7.258487799710911e-05, + "loss": 2.4194, + "step": 7252 + }, + { + "epoch": 0.8316706799678936, + "grad_norm": 0.29114726733632806, + "learning_rate": 7.248854875897926e-05, + "loss": 2.4057, + "step": 7253 + }, + { + "epoch": 0.8317853457172343, + "grad_norm": 0.22480701367658262, + "learning_rate": 7.239227848790508e-05, + "loss": 2.4236, + "step": 7254 + }, + { + "epoch": 0.8319000114665749, + "grad_norm": 0.23093442136998926, + "learning_rate": 7.22960671971653e-05, + "loss": 2.3343, + "step": 7255 + }, + { + "epoch": 0.8320146772159156, + "grad_norm": 0.2844865147191501, + "learning_rate": 7.219991490003014e-05, + "loss": 2.3658, + "step": 7256 + }, + { + "epoch": 0.8321293429652563, + "grad_norm": 0.23316307431744435, + "learning_rate": 7.210382160976209e-05, + "loss": 2.4953, + "step": 7257 + }, + { + "epoch": 0.8322440087145969, + "grad_norm": 0.27233437079066286, + "learning_rate": 7.200778733961544e-05, + "loss": 2.46, + "step": 7258 + }, + { + "epoch": 0.8323586744639376, + "grad_norm": 0.24225435306764317, + "learning_rate": 7.191181210283626e-05, + "loss": 2.367, + "step": 7259 + }, + { + "epoch": 0.8324733402132783, + "grad_norm": 0.26344696541372714, + "learning_rate": 7.181589591266246e-05, + "loss": 2.4097, + "step": 7260 + }, + { + "epoch": 0.832588005962619, + "grad_norm": 0.26866560268991174, + "learning_rate": 7.172003878232403e-05, + "loss": 2.4727, + "step": 7261 + }, + { + "epoch": 0.8327026717119597, + "grad_norm": 0.2531057048044504, + "learning_rate": 7.162424072504236e-05, + "loss": 2.3087, + "step": 7262 + }, + { + "epoch": 0.8328173374613003, + "grad_norm": 0.2906871180689149, + "learning_rate": 7.152850175403109e-05, + "loss": 2.4627, + "step": 7263 + }, + { + "epoch": 0.832932003210641, + "grad_norm": 0.25555493878835456, + "learning_rate": 7.143282188249562e-05, + "loss": 2.3759, + "step": 7264 + }, + { + "epoch": 0.8330466689599817, + "grad_norm": 0.2660521706000028, + "learning_rate": 7.133720112363313e-05, + "loss": 2.4182, + "step": 7265 + }, + { + "epoch": 0.8331613347093223, + "grad_norm": 0.28004182864190275, + "learning_rate": 7.124163949063267e-05, + "loss": 2.4137, + "step": 7266 + }, + { + "epoch": 0.833276000458663, + "grad_norm": 0.24903384072780274, + "learning_rate": 7.114613699667527e-05, + "loss": 2.3767, + "step": 7267 + }, + { + "epoch": 0.8333906662080036, + "grad_norm": 0.23752177820548975, + "learning_rate": 7.105069365493338e-05, + "loss": 2.3715, + "step": 7268 + }, + { + "epoch": 0.8335053319573443, + "grad_norm": 0.24603997173608957, + "learning_rate": 7.095530947857199e-05, + "loss": 2.2875, + "step": 7269 + }, + { + "epoch": 0.833619997706685, + "grad_norm": 0.27297910269369585, + "learning_rate": 7.08599844807472e-05, + "loss": 2.472, + "step": 7270 + }, + { + "epoch": 0.8337346634560256, + "grad_norm": 0.26005650102857897, + "learning_rate": 7.076471867460743e-05, + "loss": 2.4577, + "step": 7271 + }, + { + "epoch": 0.8338493292053664, + "grad_norm": 0.25524278375618115, + "learning_rate": 7.066951207329276e-05, + "loss": 2.4469, + "step": 7272 + }, + { + "epoch": 0.8339639949547071, + "grad_norm": 0.2782497377626116, + "learning_rate": 7.057436468993505e-05, + "loss": 2.4549, + "step": 7273 + }, + { + "epoch": 0.8340786607040477, + "grad_norm": 0.26019365486685053, + "learning_rate": 7.047927653765817e-05, + "loss": 2.2951, + "step": 7274 + }, + { + "epoch": 0.8341933264533884, + "grad_norm": 0.24025601853475834, + "learning_rate": 7.03842476295778e-05, + "loss": 2.3485, + "step": 7275 + }, + { + "epoch": 0.834307992202729, + "grad_norm": 0.24123276404205093, + "learning_rate": 7.028927797880114e-05, + "loss": 2.3945, + "step": 7276 + }, + { + "epoch": 0.8344226579520697, + "grad_norm": 0.2508800247432839, + "learning_rate": 7.019436759842757e-05, + "loss": 2.3946, + "step": 7277 + }, + { + "epoch": 0.8345373237014104, + "grad_norm": 0.24428499995314762, + "learning_rate": 7.009951650154811e-05, + "loss": 2.3261, + "step": 7278 + }, + { + "epoch": 0.834651989450751, + "grad_norm": 0.2718217213821309, + "learning_rate": 7.00047247012457e-05, + "loss": 2.5009, + "step": 7279 + }, + { + "epoch": 0.8347666552000917, + "grad_norm": 0.26977900984501996, + "learning_rate": 6.990999221059507e-05, + "loss": 2.3587, + "step": 7280 + }, + { + "epoch": 0.8348813209494325, + "grad_norm": 0.24865884789957596, + "learning_rate": 6.981531904266286e-05, + "loss": 2.36, + "step": 7281 + }, + { + "epoch": 0.8349959866987731, + "grad_norm": 0.27179563907116217, + "learning_rate": 6.972070521050722e-05, + "loss": 2.4209, + "step": 7282 + }, + { + "epoch": 0.8351106524481138, + "grad_norm": 0.2950936695707222, + "learning_rate": 6.962615072717831e-05, + "loss": 2.3474, + "step": 7283 + }, + { + "epoch": 0.8352253181974544, + "grad_norm": 0.2404068639289652, + "learning_rate": 6.95316556057185e-05, + "loss": 2.4528, + "step": 7284 + }, + { + "epoch": 0.8353399839467951, + "grad_norm": 0.2551203600407414, + "learning_rate": 6.943721985916113e-05, + "loss": 2.4323, + "step": 7285 + }, + { + "epoch": 0.8354546496961358, + "grad_norm": 0.26486004366700383, + "learning_rate": 6.93428435005321e-05, + "loss": 2.3465, + "step": 7286 + }, + { + "epoch": 0.8355693154454764, + "grad_norm": 0.24235792262936767, + "learning_rate": 6.92485265428488e-05, + "loss": 2.3695, + "step": 7287 + }, + { + "epoch": 0.8356839811948171, + "grad_norm": 0.27733897004089325, + "learning_rate": 6.915426899912013e-05, + "loss": 2.4402, + "step": 7288 + }, + { + "epoch": 0.8357986469441577, + "grad_norm": 0.24963678669127454, + "learning_rate": 6.906007088234756e-05, + "loss": 2.4752, + "step": 7289 + }, + { + "epoch": 0.8359133126934984, + "grad_norm": 0.24232553263118228, + "learning_rate": 6.896593220552383e-05, + "loss": 2.3814, + "step": 7290 + }, + { + "epoch": 0.8360279784428392, + "grad_norm": 0.2491660624907511, + "learning_rate": 6.88718529816334e-05, + "loss": 2.3512, + "step": 7291 + }, + { + "epoch": 0.8361426441921798, + "grad_norm": 0.25502406516151066, + "learning_rate": 6.877783322365283e-05, + "loss": 2.3183, + "step": 7292 + }, + { + "epoch": 0.8362573099415205, + "grad_norm": 0.24969230662069572, + "learning_rate": 6.868387294455026e-05, + "loss": 2.358, + "step": 7293 + }, + { + "epoch": 0.8363719756908612, + "grad_norm": 0.25759095080743494, + "learning_rate": 6.85899721572858e-05, + "loss": 2.5223, + "step": 7294 + }, + { + "epoch": 0.8364866414402018, + "grad_norm": 0.2699824723129227, + "learning_rate": 6.849613087481126e-05, + "loss": 2.36, + "step": 7295 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.24788418939363163, + "learning_rate": 6.840234911007043e-05, + "loss": 2.5134, + "step": 7296 + }, + { + "epoch": 0.8367159729388831, + "grad_norm": 0.2513732101988989, + "learning_rate": 6.830862687599837e-05, + "loss": 2.2801, + "step": 7297 + }, + { + "epoch": 0.8368306386882238, + "grad_norm": 0.23198748868272304, + "learning_rate": 6.821496418552231e-05, + "loss": 2.4725, + "step": 7298 + }, + { + "epoch": 0.8369453044375645, + "grad_norm": 0.263116211005387, + "learning_rate": 6.812136105156163e-05, + "loss": 2.4326, + "step": 7299 + }, + { + "epoch": 0.8370599701869051, + "grad_norm": 0.23793405248878913, + "learning_rate": 6.802781748702674e-05, + "loss": 2.4031, + "step": 7300 + }, + { + "epoch": 0.8371746359362459, + "grad_norm": 0.273780723537298, + "learning_rate": 6.793433350482025e-05, + "loss": 2.3825, + "step": 7301 + }, + { + "epoch": 0.8372893016855865, + "grad_norm": 0.2308942464101055, + "learning_rate": 6.784090911783664e-05, + "loss": 2.3472, + "step": 7302 + }, + { + "epoch": 0.8374039674349272, + "grad_norm": 0.2531369834290967, + "learning_rate": 6.774754433896174e-05, + "loss": 2.3137, + "step": 7303 + }, + { + "epoch": 0.8375186331842679, + "grad_norm": 0.27257079247329097, + "learning_rate": 6.765423918107372e-05, + "loss": 2.3903, + "step": 7304 + }, + { + "epoch": 0.8376332989336085, + "grad_norm": 0.24456377464439655, + "learning_rate": 6.756099365704221e-05, + "loss": 2.4615, + "step": 7305 + }, + { + "epoch": 0.8377479646829492, + "grad_norm": 0.27576612640762316, + "learning_rate": 6.746780777972849e-05, + "loss": 2.3409, + "step": 7306 + }, + { + "epoch": 0.8378626304322899, + "grad_norm": 0.24949247309246808, + "learning_rate": 6.73746815619859e-05, + "loss": 2.4692, + "step": 7307 + }, + { + "epoch": 0.8379772961816305, + "grad_norm": 0.26647716159428586, + "learning_rate": 6.728161501665942e-05, + "loss": 2.482, + "step": 7308 + }, + { + "epoch": 0.8380919619309712, + "grad_norm": 0.2637306942825546, + "learning_rate": 6.718860815658573e-05, + "loss": 2.4026, + "step": 7309 + }, + { + "epoch": 0.8382066276803118, + "grad_norm": 0.24756735286244275, + "learning_rate": 6.70956609945934e-05, + "loss": 2.337, + "step": 7310 + }, + { + "epoch": 0.8383212934296526, + "grad_norm": 0.24668977270785789, + "learning_rate": 6.700277354350287e-05, + "loss": 2.5056, + "step": 7311 + }, + { + "epoch": 0.8384359591789933, + "grad_norm": 0.2388337656994437, + "learning_rate": 6.690994581612597e-05, + "loss": 2.2893, + "step": 7312 + }, + { + "epoch": 0.8385506249283339, + "grad_norm": 0.3004076562737932, + "learning_rate": 6.681717782526659e-05, + "loss": 2.5131, + "step": 7313 + }, + { + "epoch": 0.8386652906776746, + "grad_norm": 0.25350005487696986, + "learning_rate": 6.672446958372025e-05, + "loss": 2.2998, + "step": 7314 + }, + { + "epoch": 0.8387799564270153, + "grad_norm": 0.28073604632860066, + "learning_rate": 6.663182110427441e-05, + "loss": 2.5, + "step": 7315 + }, + { + "epoch": 0.8388946221763559, + "grad_norm": 0.24501884135959578, + "learning_rate": 6.65392323997081e-05, + "loss": 2.278, + "step": 7316 + }, + { + "epoch": 0.8390092879256966, + "grad_norm": 0.256193076309501, + "learning_rate": 6.644670348279225e-05, + "loss": 2.3912, + "step": 7317 + }, + { + "epoch": 0.8391239536750372, + "grad_norm": 0.2573664545228904, + "learning_rate": 6.635423436628917e-05, + "loss": 2.3905, + "step": 7318 + }, + { + "epoch": 0.8392386194243779, + "grad_norm": 0.2570706676683893, + "learning_rate": 6.626182506295363e-05, + "loss": 2.3977, + "step": 7319 + }, + { + "epoch": 0.8393532851737187, + "grad_norm": 0.2890941402018045, + "learning_rate": 6.616947558553137e-05, + "loss": 2.4346, + "step": 7320 + }, + { + "epoch": 0.8394679509230593, + "grad_norm": 0.25485391141520813, + "learning_rate": 6.607718594676048e-05, + "loss": 2.2933, + "step": 7321 + }, + { + "epoch": 0.8395826166724, + "grad_norm": 0.2577630225856674, + "learning_rate": 6.598495615937039e-05, + "loss": 2.3925, + "step": 7322 + }, + { + "epoch": 0.8396972824217406, + "grad_norm": 0.2505373404336989, + "learning_rate": 6.589278623608259e-05, + "loss": 2.3808, + "step": 7323 + }, + { + "epoch": 0.8398119481710813, + "grad_norm": 0.25458866578444955, + "learning_rate": 6.580067618961006e-05, + "loss": 2.3797, + "step": 7324 + }, + { + "epoch": 0.839926613920422, + "grad_norm": 0.26753147989259257, + "learning_rate": 6.570862603265771e-05, + "loss": 2.458, + "step": 7325 + }, + { + "epoch": 0.8400412796697626, + "grad_norm": 0.24254139339112835, + "learning_rate": 6.561663577792198e-05, + "loss": 2.3326, + "step": 7326 + }, + { + "epoch": 0.8401559454191033, + "grad_norm": 0.2859343549058705, + "learning_rate": 6.55247054380913e-05, + "loss": 2.4524, + "step": 7327 + }, + { + "epoch": 0.840270611168444, + "grad_norm": 0.24524962378953882, + "learning_rate": 6.543283502584557e-05, + "loss": 2.2975, + "step": 7328 + }, + { + "epoch": 0.8403852769177846, + "grad_norm": 0.28286575223812377, + "learning_rate": 6.53410245538567e-05, + "loss": 2.4589, + "step": 7329 + }, + { + "epoch": 0.8404999426671254, + "grad_norm": 0.27158284475979777, + "learning_rate": 6.524927403478815e-05, + "loss": 2.4286, + "step": 7330 + }, + { + "epoch": 0.840614608416466, + "grad_norm": 0.2727525703568084, + "learning_rate": 6.51575834812952e-05, + "loss": 2.4674, + "step": 7331 + }, + { + "epoch": 0.8407292741658067, + "grad_norm": 0.29112258870111296, + "learning_rate": 6.506595290602468e-05, + "loss": 2.5155, + "step": 7332 + }, + { + "epoch": 0.8408439399151474, + "grad_norm": 0.25921260413305397, + "learning_rate": 6.497438232161524e-05, + "loss": 2.4807, + "step": 7333 + }, + { + "epoch": 0.840958605664488, + "grad_norm": 0.24785690549810055, + "learning_rate": 6.488287174069762e-05, + "loss": 2.3841, + "step": 7334 + }, + { + "epoch": 0.8410732714138287, + "grad_norm": 0.25456652507733446, + "learning_rate": 6.47914211758937e-05, + "loss": 2.4636, + "step": 7335 + }, + { + "epoch": 0.8411879371631693, + "grad_norm": 0.25809020802231264, + "learning_rate": 6.470003063981733e-05, + "loss": 2.4407, + "step": 7336 + }, + { + "epoch": 0.84130260291251, + "grad_norm": 0.24323508425158163, + "learning_rate": 6.460870014507431e-05, + "loss": 2.2582, + "step": 7337 + }, + { + "epoch": 0.8414172686618507, + "grad_norm": 0.2588758204058675, + "learning_rate": 6.451742970426161e-05, + "loss": 2.4089, + "step": 7338 + }, + { + "epoch": 0.8415319344111913, + "grad_norm": 0.27148679795716, + "learning_rate": 6.44262193299685e-05, + "loss": 2.4399, + "step": 7339 + }, + { + "epoch": 0.841646600160532, + "grad_norm": 0.2657109929302001, + "learning_rate": 6.433506903477571e-05, + "loss": 2.4002, + "step": 7340 + }, + { + "epoch": 0.8417612659098728, + "grad_norm": 0.2577284612194604, + "learning_rate": 6.424397883125554e-05, + "loss": 2.3419, + "step": 7341 + }, + { + "epoch": 0.8418759316592134, + "grad_norm": 0.2550326540740129, + "learning_rate": 6.415294873197225e-05, + "loss": 2.2878, + "step": 7342 + }, + { + "epoch": 0.8419905974085541, + "grad_norm": 0.23077940450244505, + "learning_rate": 6.406197874948166e-05, + "loss": 2.3087, + "step": 7343 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.23822317447418903, + "learning_rate": 6.397106889633136e-05, + "loss": 2.334, + "step": 7344 + }, + { + "epoch": 0.8422199289072354, + "grad_norm": 0.2625904096570945, + "learning_rate": 6.388021918506065e-05, + "loss": 2.4367, + "step": 7345 + }, + { + "epoch": 0.8423345946565761, + "grad_norm": 0.2907368284328103, + "learning_rate": 6.378942962820062e-05, + "loss": 2.4346, + "step": 7346 + }, + { + "epoch": 0.8424492604059167, + "grad_norm": 0.26491362284568687, + "learning_rate": 6.369870023827374e-05, + "loss": 2.4932, + "step": 7347 + }, + { + "epoch": 0.8425639261552574, + "grad_norm": 0.25500391978179204, + "learning_rate": 6.360803102779439e-05, + "loss": 2.4485, + "step": 7348 + }, + { + "epoch": 0.8426785919045982, + "grad_norm": 0.2614024775172327, + "learning_rate": 6.351742200926897e-05, + "loss": 2.582, + "step": 7349 + }, + { + "epoch": 0.8427932576539388, + "grad_norm": 0.2474000174458802, + "learning_rate": 6.342687319519497e-05, + "loss": 2.3858, + "step": 7350 + }, + { + "epoch": 0.8429079234032795, + "grad_norm": 0.2982661846619476, + "learning_rate": 6.333638459806195e-05, + "loss": 2.3497, + "step": 7351 + }, + { + "epoch": 0.8430225891526201, + "grad_norm": 0.2715945361134651, + "learning_rate": 6.324595623035123e-05, + "loss": 2.4733, + "step": 7352 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 0.2651693783731241, + "learning_rate": 6.315558810453537e-05, + "loss": 2.5103, + "step": 7353 + }, + { + "epoch": 0.8432519206513015, + "grad_norm": 0.26166227820021287, + "learning_rate": 6.306528023307922e-05, + "loss": 2.296, + "step": 7354 + }, + { + "epoch": 0.8433665864006421, + "grad_norm": 0.25578684843953703, + "learning_rate": 6.297503262843901e-05, + "loss": 2.4718, + "step": 7355 + }, + { + "epoch": 0.8434812521499828, + "grad_norm": 0.26213480820403906, + "learning_rate": 6.28848453030625e-05, + "loss": 2.463, + "step": 7356 + }, + { + "epoch": 0.8435959178993234, + "grad_norm": 0.24959556893097148, + "learning_rate": 6.279471826938942e-05, + "loss": 2.2073, + "step": 7357 + }, + { + "epoch": 0.8437105836486641, + "grad_norm": 0.24258328351939681, + "learning_rate": 6.270465153985106e-05, + "loss": 2.2358, + "step": 7358 + }, + { + "epoch": 0.8438252493980049, + "grad_norm": 0.24700953373465703, + "learning_rate": 6.261464512687038e-05, + "loss": 2.3821, + "step": 7359 + }, + { + "epoch": 0.8439399151473455, + "grad_norm": 0.26305591558985625, + "learning_rate": 6.252469904286212e-05, + "loss": 2.2233, + "step": 7360 + }, + { + "epoch": 0.8440545808966862, + "grad_norm": 0.2652830551624764, + "learning_rate": 6.243481330023265e-05, + "loss": 2.4158, + "step": 7361 + }, + { + "epoch": 0.8441692466460269, + "grad_norm": 0.24539442068718118, + "learning_rate": 6.234498791137988e-05, + "loss": 2.4527, + "step": 7362 + }, + { + "epoch": 0.8442839123953675, + "grad_norm": 0.2732018574619513, + "learning_rate": 6.225522288869356e-05, + "loss": 2.4278, + "step": 7363 + }, + { + "epoch": 0.8443985781447082, + "grad_norm": 0.22877451667360732, + "learning_rate": 6.216551824455502e-05, + "loss": 2.5291, + "step": 7364 + }, + { + "epoch": 0.8445132438940488, + "grad_norm": 0.2655325772150972, + "learning_rate": 6.207587399133741e-05, + "loss": 2.5398, + "step": 7365 + }, + { + "epoch": 0.8446279096433895, + "grad_norm": 0.25428362933839443, + "learning_rate": 6.19862901414054e-05, + "loss": 2.3839, + "step": 7366 + }, + { + "epoch": 0.8447425753927302, + "grad_norm": 0.26474752531660295, + "learning_rate": 6.189676670711542e-05, + "loss": 2.4153, + "step": 7367 + }, + { + "epoch": 0.8448572411420708, + "grad_norm": 0.27551712502897546, + "learning_rate": 6.180730370081521e-05, + "loss": 2.5247, + "step": 7368 + }, + { + "epoch": 0.8449719068914116, + "grad_norm": 0.24579649068025336, + "learning_rate": 6.171790113484504e-05, + "loss": 2.3655, + "step": 7369 + }, + { + "epoch": 0.8450865726407522, + "grad_norm": 0.252975984238005, + "learning_rate": 6.162855902153586e-05, + "loss": 2.5797, + "step": 7370 + }, + { + "epoch": 0.8452012383900929, + "grad_norm": 0.24513496386807695, + "learning_rate": 6.153927737321086e-05, + "loss": 2.3605, + "step": 7371 + }, + { + "epoch": 0.8453159041394336, + "grad_norm": 0.2526966998419088, + "learning_rate": 6.145005620218469e-05, + "loss": 2.3886, + "step": 7372 + }, + { + "epoch": 0.8454305698887742, + "grad_norm": 0.2577473517761556, + "learning_rate": 6.13608955207638e-05, + "loss": 2.4651, + "step": 7373 + }, + { + "epoch": 0.8455452356381149, + "grad_norm": 0.21669635081659958, + "learning_rate": 6.127179534124605e-05, + "loss": 2.3285, + "step": 7374 + }, + { + "epoch": 0.8456599013874556, + "grad_norm": 0.268442581182433, + "learning_rate": 6.118275567592141e-05, + "loss": 2.5963, + "step": 7375 + }, + { + "epoch": 0.8457745671367962, + "grad_norm": 0.28499603876589635, + "learning_rate": 6.109377653707087e-05, + "loss": 2.2982, + "step": 7376 + }, + { + "epoch": 0.8458892328861369, + "grad_norm": 0.27998811886631714, + "learning_rate": 6.100485793696753e-05, + "loss": 2.5727, + "step": 7377 + }, + { + "epoch": 0.8460038986354775, + "grad_norm": 0.25688106333283234, + "learning_rate": 6.091599988787605e-05, + "loss": 2.3853, + "step": 7378 + }, + { + "epoch": 0.8461185643848183, + "grad_norm": 0.30853933039733067, + "learning_rate": 6.08272024020527e-05, + "loss": 2.4761, + "step": 7379 + }, + { + "epoch": 0.846233230134159, + "grad_norm": 0.2638574023641479, + "learning_rate": 6.073846549174533e-05, + "loss": 2.4848, + "step": 7380 + }, + { + "epoch": 0.8463478958834996, + "grad_norm": 0.2365175073026522, + "learning_rate": 6.0649789169193727e-05, + "loss": 2.5048, + "step": 7381 + }, + { + "epoch": 0.8464625616328403, + "grad_norm": 0.2424481589956357, + "learning_rate": 6.056117344662876e-05, + "loss": 2.3042, + "step": 7382 + }, + { + "epoch": 0.846577227382181, + "grad_norm": 0.23348956570811055, + "learning_rate": 6.047261833627338e-05, + "loss": 2.4641, + "step": 7383 + }, + { + "epoch": 0.8466918931315216, + "grad_norm": 0.2517947819576375, + "learning_rate": 6.0384123850342366e-05, + "loss": 2.3262, + "step": 7384 + }, + { + "epoch": 0.8468065588808623, + "grad_norm": 0.2395713403821714, + "learning_rate": 6.029569000104152e-05, + "loss": 2.3195, + "step": 7385 + }, + { + "epoch": 0.8469212246302029, + "grad_norm": 0.24399739858681516, + "learning_rate": 6.0207316800568676e-05, + "loss": 2.4086, + "step": 7386 + }, + { + "epoch": 0.8470358903795436, + "grad_norm": 0.26589052433398574, + "learning_rate": 6.0119004261113386e-05, + "loss": 2.3383, + "step": 7387 + }, + { + "epoch": 0.8471505561288843, + "grad_norm": 0.24639137050366963, + "learning_rate": 6.003075239485639e-05, + "loss": 2.3629, + "step": 7388 + }, + { + "epoch": 0.847265221878225, + "grad_norm": 0.2560674614993338, + "learning_rate": 5.994256121397057e-05, + "loss": 2.4028, + "step": 7389 + }, + { + "epoch": 0.8473798876275657, + "grad_norm": 0.2379773377532734, + "learning_rate": 5.985443073062036e-05, + "loss": 2.3552, + "step": 7390 + }, + { + "epoch": 0.8474945533769063, + "grad_norm": 0.2802583155125992, + "learning_rate": 5.9766360956961305e-05, + "loss": 2.4047, + "step": 7391 + }, + { + "epoch": 0.847609219126247, + "grad_norm": 0.2583178224968202, + "learning_rate": 5.9678351905141204e-05, + "loss": 2.3878, + "step": 7392 + }, + { + "epoch": 0.8477238848755877, + "grad_norm": 0.2640310657779709, + "learning_rate": 5.9590403587299116e-05, + "loss": 2.3564, + "step": 7393 + }, + { + "epoch": 0.8478385506249283, + "grad_norm": 0.30346044463945665, + "learning_rate": 5.950251601556589e-05, + "loss": 2.3779, + "step": 7394 + }, + { + "epoch": 0.847953216374269, + "grad_norm": 0.28333153589238047, + "learning_rate": 5.941468920206394e-05, + "loss": 2.346, + "step": 7395 + }, + { + "epoch": 0.8480678821236097, + "grad_norm": 0.26647318999969893, + "learning_rate": 5.932692315890742e-05, + "loss": 2.3383, + "step": 7396 + }, + { + "epoch": 0.8481825478729503, + "grad_norm": 0.258954386770001, + "learning_rate": 5.923921789820175e-05, + "loss": 2.4828, + "step": 7397 + }, + { + "epoch": 0.848297213622291, + "grad_norm": 0.2582813128197879, + "learning_rate": 5.9151573432044195e-05, + "loss": 2.3856, + "step": 7398 + }, + { + "epoch": 0.8484118793716316, + "grad_norm": 0.28484231174216457, + "learning_rate": 5.9063989772523917e-05, + "loss": 2.4715, + "step": 7399 + }, + { + "epoch": 0.8485265451209724, + "grad_norm": 0.24275835553791417, + "learning_rate": 5.8976466931721206e-05, + "loss": 2.5656, + "step": 7400 + }, + { + "epoch": 0.8486412108703131, + "grad_norm": 0.3037462476854509, + "learning_rate": 5.888900492170818e-05, + "loss": 2.4641, + "step": 7401 + }, + { + "epoch": 0.8487558766196537, + "grad_norm": 0.2940176942252436, + "learning_rate": 5.8801603754548684e-05, + "loss": 2.4044, + "step": 7402 + }, + { + "epoch": 0.8488705423689944, + "grad_norm": 0.2685151020416385, + "learning_rate": 5.871426344229769e-05, + "loss": 2.2659, + "step": 7403 + }, + { + "epoch": 0.848985208118335, + "grad_norm": 0.2550121924686993, + "learning_rate": 5.8626983997002556e-05, + "loss": 2.3505, + "step": 7404 + }, + { + "epoch": 0.8490998738676757, + "grad_norm": 0.2521927640985913, + "learning_rate": 5.853976543070172e-05, + "loss": 2.4366, + "step": 7405 + }, + { + "epoch": 0.8492145396170164, + "grad_norm": 0.2624617311168068, + "learning_rate": 5.845260775542516e-05, + "loss": 2.3559, + "step": 7406 + }, + { + "epoch": 0.849329205366357, + "grad_norm": 0.2650708839251532, + "learning_rate": 5.83655109831947e-05, + "loss": 2.5392, + "step": 7407 + }, + { + "epoch": 0.8494438711156977, + "grad_norm": 0.24888837987351464, + "learning_rate": 5.8278475126023635e-05, + "loss": 2.4695, + "step": 7408 + }, + { + "epoch": 0.8495585368650385, + "grad_norm": 0.2361990338659879, + "learning_rate": 5.819150019591701e-05, + "loss": 2.3647, + "step": 7409 + }, + { + "epoch": 0.8496732026143791, + "grad_norm": 0.24055793119594715, + "learning_rate": 5.810458620487124e-05, + "loss": 2.4647, + "step": 7410 + }, + { + "epoch": 0.8497878683637198, + "grad_norm": 0.25066155961743153, + "learning_rate": 5.801773316487463e-05, + "loss": 2.3476, + "step": 7411 + }, + { + "epoch": 0.8499025341130604, + "grad_norm": 0.2827528739259388, + "learning_rate": 5.7930941087906695e-05, + "loss": 2.3077, + "step": 7412 + }, + { + "epoch": 0.8500171998624011, + "grad_norm": 0.24786308722520733, + "learning_rate": 5.7844209985938865e-05, + "loss": 2.3916, + "step": 7413 + }, + { + "epoch": 0.8501318656117418, + "grad_norm": 0.2578890418696975, + "learning_rate": 5.7757539870933964e-05, + "loss": 2.4608, + "step": 7414 + }, + { + "epoch": 0.8502465313610824, + "grad_norm": 0.27209706260619676, + "learning_rate": 5.767093075484653e-05, + "loss": 2.4006, + "step": 7415 + }, + { + "epoch": 0.8503611971104231, + "grad_norm": 0.24406850873997418, + "learning_rate": 5.758438264962268e-05, + "loss": 2.2542, + "step": 7416 + }, + { + "epoch": 0.8504758628597638, + "grad_norm": 0.26564558132033356, + "learning_rate": 5.7497895567200085e-05, + "loss": 2.2802, + "step": 7417 + }, + { + "epoch": 0.8505905286091044, + "grad_norm": 0.24160028508585424, + "learning_rate": 5.741146951950776e-05, + "loss": 2.3906, + "step": 7418 + }, + { + "epoch": 0.8507051943584452, + "grad_norm": 0.27690186611941386, + "learning_rate": 5.73251045184669e-05, + "loss": 2.3152, + "step": 7419 + }, + { + "epoch": 0.8508198601077858, + "grad_norm": 0.24059730814558286, + "learning_rate": 5.7238800575989626e-05, + "loss": 2.3433, + "step": 7420 + }, + { + "epoch": 0.8509345258571265, + "grad_norm": 0.2258590769848453, + "learning_rate": 5.7152557703980034e-05, + "loss": 2.3204, + "step": 7421 + }, + { + "epoch": 0.8510491916064672, + "grad_norm": 0.31143347917557757, + "learning_rate": 5.706637591433367e-05, + "loss": 2.5194, + "step": 7422 + }, + { + "epoch": 0.8511638573558078, + "grad_norm": 0.253613217850807, + "learning_rate": 5.6980255218937625e-05, + "loss": 2.4162, + "step": 7423 + }, + { + "epoch": 0.8512785231051485, + "grad_norm": 0.24455910535803058, + "learning_rate": 5.689419562967069e-05, + "loss": 2.5173, + "step": 7424 + }, + { + "epoch": 0.8513931888544891, + "grad_norm": 0.26079317055129914, + "learning_rate": 5.6808197158403185e-05, + "loss": 2.4076, + "step": 7425 + }, + { + "epoch": 0.8515078546038298, + "grad_norm": 0.22975612616919897, + "learning_rate": 5.6722259816996744e-05, + "loss": 2.4616, + "step": 7426 + }, + { + "epoch": 0.8516225203531705, + "grad_norm": 0.2630507239123343, + "learning_rate": 5.6636383617305e-05, + "loss": 2.3969, + "step": 7427 + }, + { + "epoch": 0.8517371861025111, + "grad_norm": 0.2885157925201874, + "learning_rate": 5.65505685711728e-05, + "loss": 2.485, + "step": 7428 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.247694619357967, + "learning_rate": 5.646481469043674e-05, + "loss": 2.3786, + "step": 7429 + }, + { + "epoch": 0.8519665176011926, + "grad_norm": 0.2733639534439636, + "learning_rate": 5.6379121986924966e-05, + "loss": 2.374, + "step": 7430 + }, + { + "epoch": 0.8520811833505332, + "grad_norm": 0.25048987916610066, + "learning_rate": 5.629349047245724e-05, + "loss": 2.4347, + "step": 7431 + }, + { + "epoch": 0.8521958490998739, + "grad_norm": 0.246810369404724, + "learning_rate": 5.620792015884457e-05, + "loss": 2.3322, + "step": 7432 + }, + { + "epoch": 0.8523105148492145, + "grad_norm": 0.22519279725078284, + "learning_rate": 5.612241105788979e-05, + "loss": 2.345, + "step": 7433 + }, + { + "epoch": 0.8524251805985552, + "grad_norm": 0.2628826146842859, + "learning_rate": 5.6036963181387516e-05, + "loss": 2.4617, + "step": 7434 + }, + { + "epoch": 0.8525398463478959, + "grad_norm": 0.25420169764311584, + "learning_rate": 5.5951576541123315e-05, + "loss": 2.2493, + "step": 7435 + }, + { + "epoch": 0.8526545120972365, + "grad_norm": 0.2628101032101516, + "learning_rate": 5.5866251148874894e-05, + "loss": 2.4613, + "step": 7436 + }, + { + "epoch": 0.8527691778465772, + "grad_norm": 0.2680625969096601, + "learning_rate": 5.5780987016411214e-05, + "loss": 2.392, + "step": 7437 + }, + { + "epoch": 0.8528838435959178, + "grad_norm": 0.2609814039963656, + "learning_rate": 5.569578415549259e-05, + "loss": 2.3488, + "step": 7438 + }, + { + "epoch": 0.8529985093452586, + "grad_norm": 0.28461357598423603, + "learning_rate": 5.5610642577871404e-05, + "loss": 2.3195, + "step": 7439 + }, + { + "epoch": 0.8531131750945993, + "grad_norm": 0.24201554296547595, + "learning_rate": 5.552556229529138e-05, + "loss": 2.2454, + "step": 7440 + }, + { + "epoch": 0.8532278408439399, + "grad_norm": 0.2537792738599434, + "learning_rate": 5.54405433194875e-05, + "loss": 2.3136, + "step": 7441 + }, + { + "epoch": 0.8533425065932806, + "grad_norm": 0.2874682166144907, + "learning_rate": 5.535558566218657e-05, + "loss": 2.3575, + "step": 7442 + }, + { + "epoch": 0.8534571723426213, + "grad_norm": 0.24938415360585353, + "learning_rate": 5.527068933510687e-05, + "loss": 2.3451, + "step": 7443 + }, + { + "epoch": 0.8535718380919619, + "grad_norm": 0.24437905448870373, + "learning_rate": 5.518585434995832e-05, + "loss": 2.3803, + "step": 7444 + }, + { + "epoch": 0.8536865038413026, + "grad_norm": 0.2887218110764142, + "learning_rate": 5.510108071844222e-05, + "loss": 2.3729, + "step": 7445 + }, + { + "epoch": 0.8538011695906432, + "grad_norm": 0.2794849551525388, + "learning_rate": 5.501636845225161e-05, + "loss": 2.4006, + "step": 7446 + }, + { + "epoch": 0.8539158353399839, + "grad_norm": 0.23054717174951853, + "learning_rate": 5.4931717563070635e-05, + "loss": 2.3783, + "step": 7447 + }, + { + "epoch": 0.8540305010893247, + "grad_norm": 0.2543642470420606, + "learning_rate": 5.4847128062575415e-05, + "loss": 2.3824, + "step": 7448 + }, + { + "epoch": 0.8541451668386653, + "grad_norm": 0.28237933690372635, + "learning_rate": 5.476259996243371e-05, + "loss": 2.3931, + "step": 7449 + }, + { + "epoch": 0.854259832588006, + "grad_norm": 0.26227786293634653, + "learning_rate": 5.467813327430421e-05, + "loss": 2.3537, + "step": 7450 + }, + { + "epoch": 0.8543744983373467, + "grad_norm": 0.2505258421378121, + "learning_rate": 5.4593728009837585e-05, + "loss": 2.2931, + "step": 7451 + }, + { + "epoch": 0.8544891640866873, + "grad_norm": 0.2679298200199342, + "learning_rate": 5.450938418067614e-05, + "loss": 2.5281, + "step": 7452 + }, + { + "epoch": 0.854603829836028, + "grad_norm": 0.25762434127657835, + "learning_rate": 5.442510179845306e-05, + "loss": 2.391, + "step": 7453 + }, + { + "epoch": 0.8547184955853686, + "grad_norm": 0.2688359171292793, + "learning_rate": 5.434088087479389e-05, + "loss": 2.4066, + "step": 7454 + }, + { + "epoch": 0.8548331613347093, + "grad_norm": 0.2768619120395305, + "learning_rate": 5.425672142131527e-05, + "loss": 2.3442, + "step": 7455 + }, + { + "epoch": 0.85494782708405, + "grad_norm": 0.27156218318543174, + "learning_rate": 5.4172623449625146e-05, + "loss": 2.2954, + "step": 7456 + }, + { + "epoch": 0.8550624928333906, + "grad_norm": 0.28099427245419717, + "learning_rate": 5.408858697132341e-05, + "loss": 2.5027, + "step": 7457 + }, + { + "epoch": 0.8551771585827314, + "grad_norm": 0.23226589385877483, + "learning_rate": 5.4004611998001274e-05, + "loss": 2.3016, + "step": 7458 + }, + { + "epoch": 0.855291824332072, + "grad_norm": 0.2490192815080716, + "learning_rate": 5.392069854124143e-05, + "loss": 2.322, + "step": 7459 + }, + { + "epoch": 0.8554064900814127, + "grad_norm": 0.2788048067416016, + "learning_rate": 5.383684661261823e-05, + "loss": 2.4655, + "step": 7460 + }, + { + "epoch": 0.8555211558307534, + "grad_norm": 0.24090559279598975, + "learning_rate": 5.3753056223697415e-05, + "loss": 2.4112, + "step": 7461 + }, + { + "epoch": 0.855635821580094, + "grad_norm": 0.25805492020046117, + "learning_rate": 5.366932738603619e-05, + "loss": 2.376, + "step": 7462 + }, + { + "epoch": 0.8557504873294347, + "grad_norm": 0.2858213764314826, + "learning_rate": 5.358566011118343e-05, + "loss": 2.2827, + "step": 7463 + }, + { + "epoch": 0.8558651530787754, + "grad_norm": 0.2636881805331364, + "learning_rate": 5.350205441067946e-05, + "loss": 2.4616, + "step": 7464 + }, + { + "epoch": 0.855979818828116, + "grad_norm": 0.2906053505063247, + "learning_rate": 5.341851029605599e-05, + "loss": 2.3821, + "step": 7465 + }, + { + "epoch": 0.8560944845774567, + "grad_norm": 0.25978054088468305, + "learning_rate": 5.333502777883642e-05, + "loss": 2.3743, + "step": 7466 + }, + { + "epoch": 0.8562091503267973, + "grad_norm": 0.267879057807956, + "learning_rate": 5.3251606870535706e-05, + "loss": 2.5273, + "step": 7467 + }, + { + "epoch": 0.8563238160761381, + "grad_norm": 0.2503332371198279, + "learning_rate": 5.3168247582659814e-05, + "loss": 2.3541, + "step": 7468 + }, + { + "epoch": 0.8564384818254788, + "grad_norm": 0.2823829438082458, + "learning_rate": 5.3084949926706924e-05, + "loss": 2.4142, + "step": 7469 + }, + { + "epoch": 0.8565531475748194, + "grad_norm": 0.26504482018333186, + "learning_rate": 5.3001713914166136e-05, + "loss": 2.2558, + "step": 7470 + }, + { + "epoch": 0.8566678133241601, + "grad_norm": 0.27265628064410286, + "learning_rate": 5.291853955651837e-05, + "loss": 2.3745, + "step": 7471 + }, + { + "epoch": 0.8567824790735007, + "grad_norm": 0.2713372114241245, + "learning_rate": 5.283542686523596e-05, + "loss": 2.2901, + "step": 7472 + }, + { + "epoch": 0.8568971448228414, + "grad_norm": 0.2374886649620647, + "learning_rate": 5.2752375851782605e-05, + "loss": 2.4342, + "step": 7473 + }, + { + "epoch": 0.8570118105721821, + "grad_norm": 0.25916418214610326, + "learning_rate": 5.266938652761377e-05, + "loss": 2.2924, + "step": 7474 + }, + { + "epoch": 0.8571264763215227, + "grad_norm": 0.2599817813794228, + "learning_rate": 5.258645890417618e-05, + "loss": 2.2489, + "step": 7475 + }, + { + "epoch": 0.8572411420708634, + "grad_norm": 0.26527406015378374, + "learning_rate": 5.2503592992908075e-05, + "loss": 2.5143, + "step": 7476 + }, + { + "epoch": 0.8573558078202042, + "grad_norm": 0.2290020556249911, + "learning_rate": 5.242078880523921e-05, + "loss": 2.4696, + "step": 7477 + }, + { + "epoch": 0.8574704735695448, + "grad_norm": 0.2440494446577652, + "learning_rate": 5.233804635259093e-05, + "loss": 2.4292, + "step": 7478 + }, + { + "epoch": 0.8575851393188855, + "grad_norm": 0.23967853343296489, + "learning_rate": 5.2255365646375896e-05, + "loss": 2.3563, + "step": 7479 + }, + { + "epoch": 0.8576998050682261, + "grad_norm": 0.29740054177905095, + "learning_rate": 5.217274669799843e-05, + "loss": 2.3164, + "step": 7480 + }, + { + "epoch": 0.8578144708175668, + "grad_norm": 0.2604555153857124, + "learning_rate": 5.20901895188543e-05, + "loss": 2.2748, + "step": 7481 + }, + { + "epoch": 0.8579291365669075, + "grad_norm": 0.24850499433693082, + "learning_rate": 5.200769412033046e-05, + "loss": 2.3135, + "step": 7482 + }, + { + "epoch": 0.8580438023162481, + "grad_norm": 0.26441925455814375, + "learning_rate": 5.1925260513805584e-05, + "loss": 2.308, + "step": 7483 + }, + { + "epoch": 0.8581584680655888, + "grad_norm": 0.23873635506646962, + "learning_rate": 5.1842888710650185e-05, + "loss": 2.2353, + "step": 7484 + }, + { + "epoch": 0.8582731338149295, + "grad_norm": 0.2396979253790572, + "learning_rate": 5.17605787222255e-05, + "loss": 2.2657, + "step": 7485 + }, + { + "epoch": 0.8583877995642701, + "grad_norm": 0.25701420085020904, + "learning_rate": 5.16783305598848e-05, + "loss": 2.4428, + "step": 7486 + }, + { + "epoch": 0.8585024653136109, + "grad_norm": 0.26423914598622267, + "learning_rate": 5.1596144234972665e-05, + "loss": 2.5353, + "step": 7487 + }, + { + "epoch": 0.8586171310629515, + "grad_norm": 0.25885837728136946, + "learning_rate": 5.1514019758824915e-05, + "loss": 2.5074, + "step": 7488 + }, + { + "epoch": 0.8587317968122922, + "grad_norm": 0.2686750134878246, + "learning_rate": 5.143195714276927e-05, + "loss": 2.3648, + "step": 7489 + }, + { + "epoch": 0.8588464625616329, + "grad_norm": 0.26311816594345444, + "learning_rate": 5.1349956398124785e-05, + "loss": 2.3043, + "step": 7490 + }, + { + "epoch": 0.8589611283109735, + "grad_norm": 0.24465547724357115, + "learning_rate": 5.1268017536201696e-05, + "loss": 2.4699, + "step": 7491 + }, + { + "epoch": 0.8590757940603142, + "grad_norm": 0.27444670752145744, + "learning_rate": 5.1186140568301954e-05, + "loss": 2.4068, + "step": 7492 + }, + { + "epoch": 0.8591904598096548, + "grad_norm": 0.27747680186814133, + "learning_rate": 5.110432550571892e-05, + "loss": 2.4593, + "step": 7493 + }, + { + "epoch": 0.8593051255589955, + "grad_norm": 0.24249241700822985, + "learning_rate": 5.102257235973751e-05, + "loss": 2.3549, + "step": 7494 + }, + { + "epoch": 0.8594197913083362, + "grad_norm": 0.2739970833093966, + "learning_rate": 5.0940881141633924e-05, + "loss": 2.3568, + "step": 7495 + }, + { + "epoch": 0.8595344570576768, + "grad_norm": 0.249760384609764, + "learning_rate": 5.0859251862676035e-05, + "loss": 2.4051, + "step": 7496 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 0.28462301526596834, + "learning_rate": 5.0777684534122845e-05, + "loss": 2.443, + "step": 7497 + }, + { + "epoch": 0.8597637885563583, + "grad_norm": 0.27298531294680267, + "learning_rate": 5.069617916722502e-05, + "loss": 2.3805, + "step": 7498 + }, + { + "epoch": 0.8598784543056989, + "grad_norm": 0.26235266054910095, + "learning_rate": 5.0614735773224953e-05, + "loss": 2.3039, + "step": 7499 + }, + { + "epoch": 0.8599931200550396, + "grad_norm": 0.25426787830790465, + "learning_rate": 5.053335436335593e-05, + "loss": 2.3945, + "step": 7500 + }, + { + "epoch": 0.8601077858043802, + "grad_norm": 0.25500449845568723, + "learning_rate": 5.045203494884309e-05, + "loss": 2.2948, + "step": 7501 + }, + { + "epoch": 0.8602224515537209, + "grad_norm": 0.2582028649308595, + "learning_rate": 5.037077754090297e-05, + "loss": 2.3852, + "step": 7502 + }, + { + "epoch": 0.8603371173030616, + "grad_norm": 0.26029502411623073, + "learning_rate": 5.028958215074314e-05, + "loss": 2.4357, + "step": 7503 + }, + { + "epoch": 0.8604517830524022, + "grad_norm": 0.2566413785869189, + "learning_rate": 5.0208448789563324e-05, + "loss": 2.329, + "step": 7504 + }, + { + "epoch": 0.8605664488017429, + "grad_norm": 0.39235457308815774, + "learning_rate": 5.012737746855428e-05, + "loss": 2.3778, + "step": 7505 + }, + { + "epoch": 0.8606811145510835, + "grad_norm": 0.24477798601771983, + "learning_rate": 5.0046368198898015e-05, + "loss": 2.4195, + "step": 7506 + }, + { + "epoch": 0.8607957803004243, + "grad_norm": 0.2643168138484971, + "learning_rate": 4.996542099176843e-05, + "loss": 2.3121, + "step": 7507 + }, + { + "epoch": 0.860910446049765, + "grad_norm": 0.27102985402351976, + "learning_rate": 4.9884535858330526e-05, + "loss": 2.3153, + "step": 7508 + }, + { + "epoch": 0.8610251117991056, + "grad_norm": 0.24416388518642204, + "learning_rate": 4.9803712809740945e-05, + "loss": 2.3029, + "step": 7509 + }, + { + "epoch": 0.8611397775484463, + "grad_norm": 0.26104062757910423, + "learning_rate": 4.972295185714765e-05, + "loss": 2.3787, + "step": 7510 + }, + { + "epoch": 0.861254443297787, + "grad_norm": 0.25481360665056685, + "learning_rate": 4.964225301169023e-05, + "loss": 2.3383, + "step": 7511 + }, + { + "epoch": 0.8613691090471276, + "grad_norm": 0.23963738105396487, + "learning_rate": 4.956161628449923e-05, + "loss": 2.4837, + "step": 7512 + }, + { + "epoch": 0.8614837747964683, + "grad_norm": 0.2419191317610368, + "learning_rate": 4.94810416866972e-05, + "loss": 2.4131, + "step": 7513 + }, + { + "epoch": 0.8615984405458089, + "grad_norm": 0.21720650100970568, + "learning_rate": 4.940052922939775e-05, + "loss": 2.3352, + "step": 7514 + }, + { + "epoch": 0.8617131062951496, + "grad_norm": 0.24116176726440833, + "learning_rate": 4.93200789237061e-05, + "loss": 2.358, + "step": 7515 + }, + { + "epoch": 0.8618277720444903, + "grad_norm": 0.2309417338364269, + "learning_rate": 4.923969078071883e-05, + "loss": 2.2111, + "step": 7516 + }, + { + "epoch": 0.861942437793831, + "grad_norm": 0.26427672734598406, + "learning_rate": 4.9159364811524e-05, + "loss": 2.3385, + "step": 7517 + }, + { + "epoch": 0.8620571035431717, + "grad_norm": 0.24350230507605078, + "learning_rate": 4.907910102720087e-05, + "loss": 2.3563, + "step": 7518 + }, + { + "epoch": 0.8621717692925124, + "grad_norm": 0.2862664891226002, + "learning_rate": 4.899889943882052e-05, + "loss": 2.5323, + "step": 7519 + }, + { + "epoch": 0.862286435041853, + "grad_norm": 0.2829828106986698, + "learning_rate": 4.891876005744511e-05, + "loss": 2.471, + "step": 7520 + }, + { + "epoch": 0.8624011007911937, + "grad_norm": 0.2798733317098286, + "learning_rate": 4.883868289412835e-05, + "loss": 2.4314, + "step": 7521 + }, + { + "epoch": 0.8625157665405343, + "grad_norm": 0.25685300260632393, + "learning_rate": 4.875866795991535e-05, + "loss": 2.319, + "step": 7522 + }, + { + "epoch": 0.862630432289875, + "grad_norm": 0.28097789743909446, + "learning_rate": 4.867871526584261e-05, + "loss": 2.423, + "step": 7523 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.28397917406149864, + "learning_rate": 4.8598824822938206e-05, + "loss": 2.4456, + "step": 7524 + }, + { + "epoch": 0.8628597637885563, + "grad_norm": 0.26120545778488774, + "learning_rate": 4.8518996642221536e-05, + "loss": 2.3712, + "step": 7525 + }, + { + "epoch": 0.862974429537897, + "grad_norm": 0.26548425278540194, + "learning_rate": 4.8439230734703135e-05, + "loss": 2.3202, + "step": 7526 + }, + { + "epoch": 0.8630890952872377, + "grad_norm": 0.2541897315716967, + "learning_rate": 4.8359527111385304e-05, + "loss": 2.2117, + "step": 7527 + }, + { + "epoch": 0.8632037610365784, + "grad_norm": 0.265459699646245, + "learning_rate": 4.8279885783261704e-05, + "loss": 2.3264, + "step": 7528 + }, + { + "epoch": 0.8633184267859191, + "grad_norm": 0.28437568398474766, + "learning_rate": 4.820030676131726e-05, + "loss": 2.5454, + "step": 7529 + }, + { + "epoch": 0.8634330925352597, + "grad_norm": 0.2898740677015077, + "learning_rate": 4.812079005652842e-05, + "loss": 2.5458, + "step": 7530 + }, + { + "epoch": 0.8635477582846004, + "grad_norm": 0.2610459806874104, + "learning_rate": 4.804133567986307e-05, + "loss": 2.2771, + "step": 7531 + }, + { + "epoch": 0.8636624240339411, + "grad_norm": 0.2604464216622164, + "learning_rate": 4.796194364228029e-05, + "loss": 2.3779, + "step": 7532 + }, + { + "epoch": 0.8637770897832817, + "grad_norm": 0.2730962506091434, + "learning_rate": 4.788261395473065e-05, + "loss": 2.4007, + "step": 7533 + }, + { + "epoch": 0.8638917555326224, + "grad_norm": 0.2664853094252474, + "learning_rate": 4.780334662815644e-05, + "loss": 2.5028, + "step": 7534 + }, + { + "epoch": 0.864006421281963, + "grad_norm": 0.27936988057249607, + "learning_rate": 4.7724141673490816e-05, + "loss": 2.3651, + "step": 7535 + }, + { + "epoch": 0.8641210870313037, + "grad_norm": 0.2454247628582261, + "learning_rate": 4.7644999101658705e-05, + "loss": 2.3353, + "step": 7536 + }, + { + "epoch": 0.8642357527806445, + "grad_norm": 0.2631340836347278, + "learning_rate": 4.756591892357637e-05, + "loss": 2.4752, + "step": 7537 + }, + { + "epoch": 0.8643504185299851, + "grad_norm": 0.25685027363527335, + "learning_rate": 4.7486901150151197e-05, + "loss": 2.443, + "step": 7538 + }, + { + "epoch": 0.8644650842793258, + "grad_norm": 0.2683025680694736, + "learning_rate": 4.740794579228236e-05, + "loss": 2.4955, + "step": 7539 + }, + { + "epoch": 0.8645797500286664, + "grad_norm": 0.27734681180055326, + "learning_rate": 4.732905286086037e-05, + "loss": 2.5874, + "step": 7540 + }, + { + "epoch": 0.8646944157780071, + "grad_norm": 0.30948720090687226, + "learning_rate": 4.725022236676668e-05, + "loss": 2.4534, + "step": 7541 + }, + { + "epoch": 0.8648090815273478, + "grad_norm": 0.26625111623394787, + "learning_rate": 4.717145432087466e-05, + "loss": 2.4762, + "step": 7542 + }, + { + "epoch": 0.8649237472766884, + "grad_norm": 0.2795490410795455, + "learning_rate": 4.709274873404884e-05, + "loss": 2.477, + "step": 7543 + }, + { + "epoch": 0.8650384130260291, + "grad_norm": 0.24748686151921762, + "learning_rate": 4.701410561714509e-05, + "loss": 2.2869, + "step": 7544 + }, + { + "epoch": 0.8651530787753698, + "grad_norm": 0.25385967593851466, + "learning_rate": 4.693552498101078e-05, + "loss": 2.3031, + "step": 7545 + }, + { + "epoch": 0.8652677445247104, + "grad_norm": 0.2788516388643916, + "learning_rate": 4.6857006836484695e-05, + "loss": 2.4774, + "step": 7546 + }, + { + "epoch": 0.8653824102740512, + "grad_norm": 0.2658955969815931, + "learning_rate": 4.677855119439678e-05, + "loss": 2.5336, + "step": 7547 + }, + { + "epoch": 0.8654970760233918, + "grad_norm": 0.28068508282585364, + "learning_rate": 4.670015806556843e-05, + "loss": 2.2472, + "step": 7548 + }, + { + "epoch": 0.8656117417727325, + "grad_norm": 0.28079822547632677, + "learning_rate": 4.6621827460812725e-05, + "loss": 2.4176, + "step": 7549 + }, + { + "epoch": 0.8657264075220732, + "grad_norm": 0.2767932947049551, + "learning_rate": 4.6543559390933685e-05, + "loss": 2.4025, + "step": 7550 + }, + { + "epoch": 0.8658410732714138, + "grad_norm": 0.2789421233295059, + "learning_rate": 4.6465353866727014e-05, + "loss": 2.3948, + "step": 7551 + }, + { + "epoch": 0.8659557390207545, + "grad_norm": 0.23794909996909555, + "learning_rate": 4.638721089897968e-05, + "loss": 2.2494, + "step": 7552 + }, + { + "epoch": 0.8660704047700952, + "grad_norm": 0.25203129599706925, + "learning_rate": 4.6309130498469744e-05, + "loss": 2.4435, + "step": 7553 + }, + { + "epoch": 0.8661850705194358, + "grad_norm": 0.2658433541751157, + "learning_rate": 4.623111267596725e-05, + "loss": 2.3067, + "step": 7554 + }, + { + "epoch": 0.8662997362687765, + "grad_norm": 0.2509143147990918, + "learning_rate": 4.61531574422332e-05, + "loss": 2.4901, + "step": 7555 + }, + { + "epoch": 0.8664144020181171, + "grad_norm": 0.2627905123146582, + "learning_rate": 4.607526480801988e-05, + "loss": 2.3802, + "step": 7556 + }, + { + "epoch": 0.8665290677674579, + "grad_norm": 0.2541623611844552, + "learning_rate": 4.59974347840712e-05, + "loss": 2.3149, + "step": 7557 + }, + { + "epoch": 0.8666437335167986, + "grad_norm": 0.2514468185470806, + "learning_rate": 4.59196673811223e-05, + "loss": 2.3542, + "step": 7558 + }, + { + "epoch": 0.8667583992661392, + "grad_norm": 0.2518103404012861, + "learning_rate": 4.584196260989975e-05, + "loss": 2.3851, + "step": 7559 + }, + { + "epoch": 0.8668730650154799, + "grad_norm": 0.2748275447138567, + "learning_rate": 4.576432048112145e-05, + "loss": 2.3721, + "step": 7560 + }, + { + "epoch": 0.8669877307648205, + "grad_norm": 0.2532233203036863, + "learning_rate": 4.568674100549664e-05, + "loss": 2.4745, + "step": 7561 + }, + { + "epoch": 0.8671023965141612, + "grad_norm": 0.2589750193162282, + "learning_rate": 4.560922419372587e-05, + "loss": 2.447, + "step": 7562 + }, + { + "epoch": 0.8672170622635019, + "grad_norm": 0.2896788461721633, + "learning_rate": 4.553177005650111e-05, + "loss": 2.4129, + "step": 7563 + }, + { + "epoch": 0.8673317280128425, + "grad_norm": 0.24787864313204927, + "learning_rate": 4.545437860450574e-05, + "loss": 2.404, + "step": 7564 + }, + { + "epoch": 0.8674463937621832, + "grad_norm": 0.24490035891030096, + "learning_rate": 4.53770498484144e-05, + "loss": 2.3503, + "step": 7565 + }, + { + "epoch": 0.867561059511524, + "grad_norm": 0.29008269954674465, + "learning_rate": 4.5299783798893154e-05, + "loss": 2.5506, + "step": 7566 + }, + { + "epoch": 0.8676757252608646, + "grad_norm": 0.25932507536681576, + "learning_rate": 4.522258046659944e-05, + "loss": 2.3907, + "step": 7567 + }, + { + "epoch": 0.8677903910102053, + "grad_norm": 0.24701258550245359, + "learning_rate": 4.514543986218173e-05, + "loss": 2.3926, + "step": 7568 + }, + { + "epoch": 0.8679050567595459, + "grad_norm": 0.2631782799640303, + "learning_rate": 4.5068361996280393e-05, + "loss": 2.5567, + "step": 7569 + }, + { + "epoch": 0.8680197225088866, + "grad_norm": 0.2538545676333995, + "learning_rate": 4.499134687952672e-05, + "loss": 2.5068, + "step": 7570 + }, + { + "epoch": 0.8681343882582273, + "grad_norm": 0.26549123328737745, + "learning_rate": 4.491439452254348e-05, + "loss": 2.4734, + "step": 7571 + }, + { + "epoch": 0.8682490540075679, + "grad_norm": 0.2510687472469412, + "learning_rate": 4.483750493594474e-05, + "loss": 2.3964, + "step": 7572 + }, + { + "epoch": 0.8683637197569086, + "grad_norm": 0.2469279738097297, + "learning_rate": 4.4760678130336076e-05, + "loss": 2.3824, + "step": 7573 + }, + { + "epoch": 0.8684783855062492, + "grad_norm": 0.2357818188278889, + "learning_rate": 4.468391411631417e-05, + "loss": 2.3232, + "step": 7574 + }, + { + "epoch": 0.8685930512555899, + "grad_norm": 0.2655283428412872, + "learning_rate": 4.4607212904467285e-05, + "loss": 2.3735, + "step": 7575 + }, + { + "epoch": 0.8687077170049307, + "grad_norm": 0.2737125971745604, + "learning_rate": 4.453057450537473e-05, + "loss": 2.377, + "step": 7576 + }, + { + "epoch": 0.8688223827542713, + "grad_norm": 0.25866634348935846, + "learning_rate": 4.445399892960738e-05, + "loss": 2.3985, + "step": 7577 + }, + { + "epoch": 0.868937048503612, + "grad_norm": 0.24887854726973824, + "learning_rate": 4.437748618772741e-05, + "loss": 2.3914, + "step": 7578 + }, + { + "epoch": 0.8690517142529527, + "grad_norm": 0.24958462902763495, + "learning_rate": 4.4301036290288254e-05, + "loss": 2.3689, + "step": 7579 + }, + { + "epoch": 0.8691663800022933, + "grad_norm": 0.2633820114493486, + "learning_rate": 4.42246492478347e-05, + "loss": 2.2858, + "step": 7580 + }, + { + "epoch": 0.869281045751634, + "grad_norm": 0.25135418461105513, + "learning_rate": 4.4148325070902994e-05, + "loss": 2.3758, + "step": 7581 + }, + { + "epoch": 0.8693957115009746, + "grad_norm": 0.29293457967361597, + "learning_rate": 4.407206377002049e-05, + "loss": 2.4101, + "step": 7582 + }, + { + "epoch": 0.8695103772503153, + "grad_norm": 0.25136549978553563, + "learning_rate": 4.3995865355705876e-05, + "loss": 2.3329, + "step": 7583 + }, + { + "epoch": 0.869625042999656, + "grad_norm": 0.258351193263078, + "learning_rate": 4.391972983846959e-05, + "loss": 2.3308, + "step": 7584 + }, + { + "epoch": 0.8697397087489966, + "grad_norm": 0.22160632167761943, + "learning_rate": 4.3843657228812836e-05, + "loss": 2.4114, + "step": 7585 + }, + { + "epoch": 0.8698543744983374, + "grad_norm": 0.2670705851476512, + "learning_rate": 4.3767647537228396e-05, + "loss": 2.3808, + "step": 7586 + }, + { + "epoch": 0.8699690402476781, + "grad_norm": 0.24635500806979568, + "learning_rate": 4.369170077420048e-05, + "loss": 2.3992, + "step": 7587 + }, + { + "epoch": 0.8700837059970187, + "grad_norm": 0.2613072259723636, + "learning_rate": 4.361581695020428e-05, + "loss": 2.3956, + "step": 7588 + }, + { + "epoch": 0.8701983717463594, + "grad_norm": 0.2603857962075291, + "learning_rate": 4.353999607570674e-05, + "loss": 2.3957, + "step": 7589 + }, + { + "epoch": 0.8703130374957, + "grad_norm": 0.26819169753217453, + "learning_rate": 4.34642381611659e-05, + "loss": 2.4115, + "step": 7590 + }, + { + "epoch": 0.8704277032450407, + "grad_norm": 0.2871064807701723, + "learning_rate": 4.338854321703095e-05, + "loss": 2.4871, + "step": 7591 + }, + { + "epoch": 0.8705423689943814, + "grad_norm": 0.24936790403539127, + "learning_rate": 4.331291125374265e-05, + "loss": 2.4834, + "step": 7592 + }, + { + "epoch": 0.870657034743722, + "grad_norm": 0.265413715570518, + "learning_rate": 4.3237342281732986e-05, + "loss": 2.3899, + "step": 7593 + }, + { + "epoch": 0.8707717004930627, + "grad_norm": 0.2754066242465423, + "learning_rate": 4.3161836311425286e-05, + "loss": 2.369, + "step": 7594 + }, + { + "epoch": 0.8708863662424033, + "grad_norm": 0.25463282441226226, + "learning_rate": 4.308639335323417e-05, + "loss": 2.5293, + "step": 7595 + }, + { + "epoch": 0.8710010319917441, + "grad_norm": 0.24094454554683684, + "learning_rate": 4.301101341756558e-05, + "loss": 2.4005, + "step": 7596 + }, + { + "epoch": 0.8711156977410848, + "grad_norm": 0.27748375862915653, + "learning_rate": 4.29356965148166e-05, + "loss": 2.42, + "step": 7597 + }, + { + "epoch": 0.8712303634904254, + "grad_norm": 0.24321474036360463, + "learning_rate": 4.286044265537575e-05, + "loss": 2.3251, + "step": 7598 + }, + { + "epoch": 0.8713450292397661, + "grad_norm": 0.2305483631280844, + "learning_rate": 4.278525184962312e-05, + "loss": 2.3091, + "step": 7599 + }, + { + "epoch": 0.8714596949891068, + "grad_norm": 0.2577075625066181, + "learning_rate": 4.2710124107929624e-05, + "loss": 2.4276, + "step": 7600 + }, + { + "epoch": 0.8715743607384474, + "grad_norm": 0.26959321892250643, + "learning_rate": 4.2635059440657767e-05, + "loss": 2.4533, + "step": 7601 + }, + { + "epoch": 0.8716890264877881, + "grad_norm": 0.25885123183305475, + "learning_rate": 4.256005785816142e-05, + "loss": 2.3208, + "step": 7602 + }, + { + "epoch": 0.8718036922371287, + "grad_norm": 0.2654863752665307, + "learning_rate": 4.2485119370785254e-05, + "loss": 2.3138, + "step": 7603 + }, + { + "epoch": 0.8719183579864694, + "grad_norm": 0.26906136080506504, + "learning_rate": 4.2410243988866006e-05, + "loss": 2.341, + "step": 7604 + }, + { + "epoch": 0.8720330237358102, + "grad_norm": 0.2394984111622575, + "learning_rate": 4.233543172273119e-05, + "loss": 2.4601, + "step": 7605 + }, + { + "epoch": 0.8721476894851508, + "grad_norm": 0.2736493805587362, + "learning_rate": 4.226068258269955e-05, + "loss": 2.4671, + "step": 7606 + }, + { + "epoch": 0.8722623552344915, + "grad_norm": 0.2499539050953273, + "learning_rate": 4.218599657908151e-05, + "loss": 2.5294, + "step": 7607 + }, + { + "epoch": 0.8723770209838321, + "grad_norm": 0.25737835074394616, + "learning_rate": 4.211137372217849e-05, + "loss": 2.3614, + "step": 7608 + }, + { + "epoch": 0.8724916867331728, + "grad_norm": 0.2649955642745985, + "learning_rate": 4.203681402228332e-05, + "loss": 2.4215, + "step": 7609 + }, + { + "epoch": 0.8726063524825135, + "grad_norm": 0.2555902386064952, + "learning_rate": 4.1962317489680104e-05, + "loss": 2.4315, + "step": 7610 + }, + { + "epoch": 0.8727210182318541, + "grad_norm": 0.2583100806221419, + "learning_rate": 4.188788413464423e-05, + "loss": 2.3842, + "step": 7611 + }, + { + "epoch": 0.8728356839811948, + "grad_norm": 0.28998045843855647, + "learning_rate": 4.181351396744226e-05, + "loss": 2.4582, + "step": 7612 + }, + { + "epoch": 0.8729503497305355, + "grad_norm": 0.28796453701880076, + "learning_rate": 4.173920699833223e-05, + "loss": 2.3616, + "step": 7613 + }, + { + "epoch": 0.8730650154798761, + "grad_norm": 0.26824842564028595, + "learning_rate": 4.166496323756336e-05, + "loss": 2.3443, + "step": 7614 + }, + { + "epoch": 0.8731796812292169, + "grad_norm": 0.2662512954186086, + "learning_rate": 4.159078269537614e-05, + "loss": 2.2964, + "step": 7615 + }, + { + "epoch": 0.8732943469785575, + "grad_norm": 0.25509339386945734, + "learning_rate": 4.1516665382002385e-05, + "loss": 2.4042, + "step": 7616 + }, + { + "epoch": 0.8734090127278982, + "grad_norm": 0.24454177918960515, + "learning_rate": 4.144261130766519e-05, + "loss": 2.3534, + "step": 7617 + }, + { + "epoch": 0.8735236784772389, + "grad_norm": 0.2374474651301502, + "learning_rate": 4.136862048257872e-05, + "loss": 2.4336, + "step": 7618 + }, + { + "epoch": 0.8736383442265795, + "grad_norm": 0.2313996348113904, + "learning_rate": 4.129469291694893e-05, + "loss": 2.3994, + "step": 7619 + }, + { + "epoch": 0.8737530099759202, + "grad_norm": 0.2616459861301337, + "learning_rate": 4.122082862097243e-05, + "loss": 2.5434, + "step": 7620 + }, + { + "epoch": 0.8738676757252609, + "grad_norm": 0.2664246624796791, + "learning_rate": 4.1147027604837515e-05, + "loss": 2.4715, + "step": 7621 + }, + { + "epoch": 0.8739823414746015, + "grad_norm": 0.28760612650943945, + "learning_rate": 4.107328987872361e-05, + "loss": 2.5415, + "step": 7622 + }, + { + "epoch": 0.8740970072239422, + "grad_norm": 0.27702168168647046, + "learning_rate": 4.09996154528014e-05, + "loss": 2.6015, + "step": 7623 + }, + { + "epoch": 0.8742116729732828, + "grad_norm": 0.2472953021577611, + "learning_rate": 4.0926004337232926e-05, + "loss": 2.3707, + "step": 7624 + }, + { + "epoch": 0.8743263387226236, + "grad_norm": 0.22990184722081428, + "learning_rate": 4.085245654217146e-05, + "loss": 2.4507, + "step": 7625 + }, + { + "epoch": 0.8744410044719643, + "grad_norm": 0.24144417664890302, + "learning_rate": 4.077897207776138e-05, + "loss": 2.4182, + "step": 7626 + }, + { + "epoch": 0.8745556702213049, + "grad_norm": 0.2525417628631351, + "learning_rate": 4.0705550954138585e-05, + "loss": 2.3073, + "step": 7627 + }, + { + "epoch": 0.8746703359706456, + "grad_norm": 0.26678836328118244, + "learning_rate": 4.0632193181430074e-05, + "loss": 2.3723, + "step": 7628 + }, + { + "epoch": 0.8747850017199862, + "grad_norm": 0.26899167050816297, + "learning_rate": 4.055889876975416e-05, + "loss": 2.5314, + "step": 7629 + }, + { + "epoch": 0.8748996674693269, + "grad_norm": 0.2441483534613465, + "learning_rate": 4.048566772922041e-05, + "loss": 2.2773, + "step": 7630 + }, + { + "epoch": 0.8750143332186676, + "grad_norm": 0.24453055855428704, + "learning_rate": 4.0412500069929746e-05, + "loss": 2.4655, + "step": 7631 + }, + { + "epoch": 0.8751289989680082, + "grad_norm": 0.2587396752038846, + "learning_rate": 4.0339395801974096e-05, + "loss": 2.4048, + "step": 7632 + }, + { + "epoch": 0.8752436647173489, + "grad_norm": 0.26359620643091675, + "learning_rate": 4.0266354935436785e-05, + "loss": 2.46, + "step": 7633 + }, + { + "epoch": 0.8753583304666896, + "grad_norm": 0.24242810725388683, + "learning_rate": 4.0193377480392646e-05, + "loss": 2.3804, + "step": 7634 + }, + { + "epoch": 0.8754729962160303, + "grad_norm": 0.26072350752919854, + "learning_rate": 4.0120463446907284e-05, + "loss": 2.4393, + "step": 7635 + }, + { + "epoch": 0.875587661965371, + "grad_norm": 0.2575579224680871, + "learning_rate": 4.0047612845037885e-05, + "loss": 2.4855, + "step": 7636 + }, + { + "epoch": 0.8757023277147116, + "grad_norm": 0.2646997709456313, + "learning_rate": 3.9974825684832905e-05, + "loss": 2.357, + "step": 7637 + }, + { + "epoch": 0.8758169934640523, + "grad_norm": 0.26019690866772366, + "learning_rate": 3.9902101976331654e-05, + "loss": 2.4213, + "step": 7638 + }, + { + "epoch": 0.875931659213393, + "grad_norm": 0.24264201480781492, + "learning_rate": 3.982944172956526e-05, + "loss": 2.3159, + "step": 7639 + }, + { + "epoch": 0.8760463249627336, + "grad_norm": 0.24727180741188337, + "learning_rate": 3.975684495455584e-05, + "loss": 2.3334, + "step": 7640 + }, + { + "epoch": 0.8761609907120743, + "grad_norm": 0.2616552500011797, + "learning_rate": 3.968431166131647e-05, + "loss": 2.3125, + "step": 7641 + }, + { + "epoch": 0.8762756564614149, + "grad_norm": 0.2653104251259976, + "learning_rate": 3.961184185985195e-05, + "loss": 2.2448, + "step": 7642 + }, + { + "epoch": 0.8763903222107556, + "grad_norm": 0.2709995102838466, + "learning_rate": 3.9539435560158045e-05, + "loss": 2.3838, + "step": 7643 + }, + { + "epoch": 0.8765049879600963, + "grad_norm": 0.25529329876921053, + "learning_rate": 3.9467092772221823e-05, + "loss": 2.3685, + "step": 7644 + }, + { + "epoch": 0.876619653709437, + "grad_norm": 0.26286207784063564, + "learning_rate": 3.939481350602159e-05, + "loss": 2.4253, + "step": 7645 + }, + { + "epoch": 0.8767343194587777, + "grad_norm": 0.2652555961270944, + "learning_rate": 3.932259777152702e-05, + "loss": 2.3661, + "step": 7646 + }, + { + "epoch": 0.8768489852081184, + "grad_norm": 0.2559564037677125, + "learning_rate": 3.9250445578698665e-05, + "loss": 2.4806, + "step": 7647 + }, + { + "epoch": 0.876963650957459, + "grad_norm": 0.2564624994804444, + "learning_rate": 3.9178356937488555e-05, + "loss": 2.3919, + "step": 7648 + }, + { + "epoch": 0.8770783167067997, + "grad_norm": 0.25174082041608176, + "learning_rate": 3.910633185784024e-05, + "loss": 2.4126, + "step": 7649 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.2616942726240522, + "learning_rate": 3.903437034968793e-05, + "loss": 2.3121, + "step": 7650 + }, + { + "epoch": 0.877307648205481, + "grad_norm": 0.22940061510322224, + "learning_rate": 3.896247242295742e-05, + "loss": 2.5577, + "step": 7651 + }, + { + "epoch": 0.8774223139548217, + "grad_norm": 0.2607146298396803, + "learning_rate": 3.8890638087565765e-05, + "loss": 2.5542, + "step": 7652 + }, + { + "epoch": 0.8775369797041623, + "grad_norm": 0.2821742461765685, + "learning_rate": 3.881886735342083e-05, + "loss": 2.4249, + "step": 7653 + }, + { + "epoch": 0.877651645453503, + "grad_norm": 0.26117832784199285, + "learning_rate": 3.874716023042235e-05, + "loss": 2.4518, + "step": 7654 + }, + { + "epoch": 0.8777663112028438, + "grad_norm": 0.2595817848545602, + "learning_rate": 3.867551672846092e-05, + "loss": 2.3867, + "step": 7655 + }, + { + "epoch": 0.8778809769521844, + "grad_norm": 0.27533690013425666, + "learning_rate": 3.86039368574182e-05, + "loss": 2.4348, + "step": 7656 + }, + { + "epoch": 0.8779956427015251, + "grad_norm": 0.23623155375699328, + "learning_rate": 3.8532420627167384e-05, + "loss": 2.419, + "step": 7657 + }, + { + "epoch": 0.8781103084508657, + "grad_norm": 0.24334702142614192, + "learning_rate": 3.846096804757282e-05, + "loss": 2.4511, + "step": 7658 + }, + { + "epoch": 0.8782249742002064, + "grad_norm": 0.2831851053581169, + "learning_rate": 3.838957912848995e-05, + "loss": 2.3661, + "step": 7659 + }, + { + "epoch": 0.8783396399495471, + "grad_norm": 0.25501863558789634, + "learning_rate": 3.8318253879765506e-05, + "loss": 2.3724, + "step": 7660 + }, + { + "epoch": 0.8784543056988877, + "grad_norm": 0.27956953106040827, + "learning_rate": 3.824699231123763e-05, + "loss": 2.3913, + "step": 7661 + }, + { + "epoch": 0.8785689714482284, + "grad_norm": 0.22748027738332083, + "learning_rate": 3.817579443273522e-05, + "loss": 2.2276, + "step": 7662 + }, + { + "epoch": 0.878683637197569, + "grad_norm": 0.2593140656839005, + "learning_rate": 3.810466025407883e-05, + "loss": 2.5078, + "step": 7663 + }, + { + "epoch": 0.8787983029469097, + "grad_norm": 0.24681092612583647, + "learning_rate": 3.8033589785080034e-05, + "loss": 2.5683, + "step": 7664 + }, + { + "epoch": 0.8789129686962505, + "grad_norm": 0.2416036463231976, + "learning_rate": 3.796258303554162e-05, + "loss": 2.543, + "step": 7665 + }, + { + "epoch": 0.8790276344455911, + "grad_norm": 0.2766024816283266, + "learning_rate": 3.789164001525769e-05, + "loss": 2.5153, + "step": 7666 + }, + { + "epoch": 0.8791423001949318, + "grad_norm": 0.2563242960549551, + "learning_rate": 3.782076073401347e-05, + "loss": 2.516, + "step": 7667 + }, + { + "epoch": 0.8792569659442725, + "grad_norm": 0.23862940113332057, + "learning_rate": 3.774994520158531e-05, + "loss": 2.415, + "step": 7668 + }, + { + "epoch": 0.8793716316936131, + "grad_norm": 0.2522519479222634, + "learning_rate": 3.7679193427740895e-05, + "loss": 2.2903, + "step": 7669 + }, + { + "epoch": 0.8794862974429538, + "grad_norm": 0.2591567267096782, + "learning_rate": 3.760850542223909e-05, + "loss": 2.2433, + "step": 7670 + }, + { + "epoch": 0.8796009631922944, + "grad_norm": 0.2508280644116827, + "learning_rate": 3.753788119483004e-05, + "loss": 2.3279, + "step": 7671 + }, + { + "epoch": 0.8797156289416351, + "grad_norm": 0.2716913282594248, + "learning_rate": 3.746732075525494e-05, + "loss": 2.4559, + "step": 7672 + }, + { + "epoch": 0.8798302946909758, + "grad_norm": 0.28585096843898883, + "learning_rate": 3.73968241132463e-05, + "loss": 2.3828, + "step": 7673 + }, + { + "epoch": 0.8799449604403164, + "grad_norm": 0.29546035398047377, + "learning_rate": 3.7326391278527605e-05, + "loss": 2.4535, + "step": 7674 + }, + { + "epoch": 0.8800596261896572, + "grad_norm": 0.2546297539491112, + "learning_rate": 3.725602226081409e-05, + "loss": 2.3608, + "step": 7675 + }, + { + "epoch": 0.8801742919389978, + "grad_norm": 0.25840799060391817, + "learning_rate": 3.71857170698115e-05, + "loss": 2.4109, + "step": 7676 + }, + { + "epoch": 0.8802889576883385, + "grad_norm": 0.27521806217112205, + "learning_rate": 3.711547571521717e-05, + "loss": 2.2516, + "step": 7677 + }, + { + "epoch": 0.8804036234376792, + "grad_norm": 0.25297973264287055, + "learning_rate": 3.7045298206719594e-05, + "loss": 2.3747, + "step": 7678 + }, + { + "epoch": 0.8805182891870198, + "grad_norm": 0.2571466502587079, + "learning_rate": 3.69751845539984e-05, + "loss": 2.3711, + "step": 7679 + }, + { + "epoch": 0.8806329549363605, + "grad_norm": 0.27663279116394934, + "learning_rate": 3.6905134766724466e-05, + "loss": 2.3519, + "step": 7680 + }, + { + "epoch": 0.8807476206857012, + "grad_norm": 0.24293568497480866, + "learning_rate": 3.683514885455985e-05, + "loss": 2.5819, + "step": 7681 + }, + { + "epoch": 0.8808622864350418, + "grad_norm": 0.26510507872151523, + "learning_rate": 3.676522682715766e-05, + "loss": 2.3801, + "step": 7682 + }, + { + "epoch": 0.8809769521843825, + "grad_norm": 0.2517953498303615, + "learning_rate": 3.6695368694162345e-05, + "loss": 2.2978, + "step": 7683 + }, + { + "epoch": 0.8810916179337231, + "grad_norm": 0.29215431056835484, + "learning_rate": 3.662557446520959e-05, + "loss": 2.3113, + "step": 7684 + }, + { + "epoch": 0.8812062836830639, + "grad_norm": 0.26625909025735733, + "learning_rate": 3.655584414992608e-05, + "loss": 2.3097, + "step": 7685 + }, + { + "epoch": 0.8813209494324046, + "grad_norm": 0.23211484920922443, + "learning_rate": 3.6486177757929784e-05, + "loss": 2.3062, + "step": 7686 + }, + { + "epoch": 0.8814356151817452, + "grad_norm": 0.27727865200689944, + "learning_rate": 3.641657529883002e-05, + "loss": 2.5506, + "step": 7687 + }, + { + "epoch": 0.8815502809310859, + "grad_norm": 0.2512554364969393, + "learning_rate": 3.634703678222689e-05, + "loss": 2.4909, + "step": 7688 + }, + { + "epoch": 0.8816649466804266, + "grad_norm": 0.25459045606606606, + "learning_rate": 3.6277562217711946e-05, + "loss": 2.4853, + "step": 7689 + }, + { + "epoch": 0.8817796124297672, + "grad_norm": 0.2564899100539294, + "learning_rate": 3.620815161486807e-05, + "loss": 2.3574, + "step": 7690 + }, + { + "epoch": 0.8818942781791079, + "grad_norm": 0.25398050635245323, + "learning_rate": 3.613880498326894e-05, + "loss": 2.3883, + "step": 7691 + }, + { + "epoch": 0.8820089439284485, + "grad_norm": 0.2535465928738247, + "learning_rate": 3.6069522332479674e-05, + "loss": 2.3935, + "step": 7692 + }, + { + "epoch": 0.8821236096777892, + "grad_norm": 0.2491559046101294, + "learning_rate": 3.6000303672056537e-05, + "loss": 2.4813, + "step": 7693 + }, + { + "epoch": 0.88223827542713, + "grad_norm": 0.2799190692029175, + "learning_rate": 3.593114901154676e-05, + "loss": 2.2892, + "step": 7694 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.27621329226019237, + "learning_rate": 3.586205836048906e-05, + "loss": 2.4511, + "step": 7695 + }, + { + "epoch": 0.8824676069258113, + "grad_norm": 0.24100600796440189, + "learning_rate": 3.579303172841319e-05, + "loss": 2.4576, + "step": 7696 + }, + { + "epoch": 0.8825822726751519, + "grad_norm": 0.2584762015053148, + "learning_rate": 3.5724069124839945e-05, + "loss": 2.4851, + "step": 7697 + }, + { + "epoch": 0.8826969384244926, + "grad_norm": 0.28550518874219943, + "learning_rate": 3.565517055928147e-05, + "loss": 2.4149, + "step": 7698 + }, + { + "epoch": 0.8828116041738333, + "grad_norm": 0.26534674304662104, + "learning_rate": 3.5586336041241004e-05, + "loss": 2.4513, + "step": 7699 + }, + { + "epoch": 0.8829262699231739, + "grad_norm": 0.29498718440544464, + "learning_rate": 3.5517565580212966e-05, + "loss": 2.5358, + "step": 7700 + }, + { + "epoch": 0.8830409356725146, + "grad_norm": 0.2630769266983824, + "learning_rate": 3.5448859185682856e-05, + "loss": 2.3146, + "step": 7701 + }, + { + "epoch": 0.8831556014218553, + "grad_norm": 0.2693022580221657, + "learning_rate": 3.538021686712761e-05, + "loss": 2.3084, + "step": 7702 + }, + { + "epoch": 0.8832702671711959, + "grad_norm": 0.2629705608509755, + "learning_rate": 3.5311638634014874e-05, + "loss": 2.3208, + "step": 7703 + }, + { + "epoch": 0.8833849329205367, + "grad_norm": 0.25981481205938256, + "learning_rate": 3.524312449580375e-05, + "loss": 2.5015, + "step": 7704 + }, + { + "epoch": 0.8834995986698773, + "grad_norm": 0.24200271527823503, + "learning_rate": 3.517467446194472e-05, + "loss": 2.2255, + "step": 7705 + }, + { + "epoch": 0.883614264419218, + "grad_norm": 0.2608698729408616, + "learning_rate": 3.510628854187886e-05, + "loss": 2.342, + "step": 7706 + }, + { + "epoch": 0.8837289301685587, + "grad_norm": 0.2434117234689837, + "learning_rate": 3.503796674503884e-05, + "loss": 2.3327, + "step": 7707 + }, + { + "epoch": 0.8838435959178993, + "grad_norm": 0.2414002881663324, + "learning_rate": 3.496970908084834e-05, + "loss": 2.4323, + "step": 7708 + }, + { + "epoch": 0.88395826166724, + "grad_norm": 0.2519533507392297, + "learning_rate": 3.490151555872212e-05, + "loss": 2.5395, + "step": 7709 + }, + { + "epoch": 0.8840729274165807, + "grad_norm": 0.2791458549507502, + "learning_rate": 3.483338618806625e-05, + "loss": 2.3837, + "step": 7710 + }, + { + "epoch": 0.8841875931659213, + "grad_norm": 0.2289687750593109, + "learning_rate": 3.4765320978278004e-05, + "loss": 2.2176, + "step": 7711 + }, + { + "epoch": 0.884302258915262, + "grad_norm": 0.2740979751747013, + "learning_rate": 3.469731993874542e-05, + "loss": 2.3464, + "step": 7712 + }, + { + "epoch": 0.8844169246646026, + "grad_norm": 0.2678973478862052, + "learning_rate": 3.4629383078848106e-05, + "loss": 2.4294, + "step": 7713 + }, + { + "epoch": 0.8845315904139434, + "grad_norm": 0.2843393999561222, + "learning_rate": 3.456151040795663e-05, + "loss": 2.5201, + "step": 7714 + }, + { + "epoch": 0.8846462561632841, + "grad_norm": 0.2602932119888259, + "learning_rate": 3.449370193543272e-05, + "loss": 2.3309, + "step": 7715 + }, + { + "epoch": 0.8847609219126247, + "grad_norm": 0.24984559252827407, + "learning_rate": 3.4425957670629284e-05, + "loss": 2.3974, + "step": 7716 + }, + { + "epoch": 0.8848755876619654, + "grad_norm": 0.25928177983440165, + "learning_rate": 3.43582776228904e-05, + "loss": 2.5503, + "step": 7717 + }, + { + "epoch": 0.884990253411306, + "grad_norm": 0.24475822792902074, + "learning_rate": 3.429066180155111e-05, + "loss": 2.3711, + "step": 7718 + }, + { + "epoch": 0.8851049191606467, + "grad_norm": 0.2289359717670924, + "learning_rate": 3.422311021593777e-05, + "loss": 2.4663, + "step": 7719 + }, + { + "epoch": 0.8852195849099874, + "grad_norm": 0.2818462664822109, + "learning_rate": 3.415562287536789e-05, + "loss": 2.5567, + "step": 7720 + }, + { + "epoch": 0.885334250659328, + "grad_norm": 0.2563939199579085, + "learning_rate": 3.408819978915001e-05, + "loss": 2.5003, + "step": 7721 + }, + { + "epoch": 0.8854489164086687, + "grad_norm": 0.25629023485693486, + "learning_rate": 3.4020840966583924e-05, + "loss": 2.2314, + "step": 7722 + }, + { + "epoch": 0.8855635821580095, + "grad_norm": 0.27371491588418795, + "learning_rate": 3.3953546416960525e-05, + "loss": 2.3868, + "step": 7723 + }, + { + "epoch": 0.8856782479073501, + "grad_norm": 0.2580296951907124, + "learning_rate": 3.388631614956156e-05, + "loss": 2.4225, + "step": 7724 + }, + { + "epoch": 0.8857929136566908, + "grad_norm": 0.2525494786826452, + "learning_rate": 3.38191501736605e-05, + "loss": 2.446, + "step": 7725 + }, + { + "epoch": 0.8859075794060314, + "grad_norm": 0.25758612792633184, + "learning_rate": 3.375204849852137e-05, + "loss": 2.373, + "step": 7726 + }, + { + "epoch": 0.8860222451553721, + "grad_norm": 0.2518726559205612, + "learning_rate": 3.3685011133399724e-05, + "loss": 2.4361, + "step": 7727 + }, + { + "epoch": 0.8861369109047128, + "grad_norm": 0.2539401257950697, + "learning_rate": 3.361803808754194e-05, + "loss": 2.2387, + "step": 7728 + }, + { + "epoch": 0.8862515766540534, + "grad_norm": 0.2710292371866573, + "learning_rate": 3.3551129370185786e-05, + "loss": 2.451, + "step": 7729 + }, + { + "epoch": 0.8863662424033941, + "grad_norm": 0.26679334313908754, + "learning_rate": 3.348428499056e-05, + "loss": 2.3997, + "step": 7730 + }, + { + "epoch": 0.8864809081527347, + "grad_norm": 0.2655288831546453, + "learning_rate": 3.341750495788459e-05, + "loss": 2.4122, + "step": 7731 + }, + { + "epoch": 0.8865955739020754, + "grad_norm": 0.2704158250167759, + "learning_rate": 3.3350789281370477e-05, + "loss": 2.3696, + "step": 7732 + }, + { + "epoch": 0.8867102396514162, + "grad_norm": 0.27163187571066705, + "learning_rate": 3.328413797021973e-05, + "loss": 2.3585, + "step": 7733 + }, + { + "epoch": 0.8868249054007568, + "grad_norm": 0.24313872140552956, + "learning_rate": 3.321755103362584e-05, + "loss": 2.3793, + "step": 7734 + }, + { + "epoch": 0.8869395711500975, + "grad_norm": 0.2608363617738911, + "learning_rate": 3.315102848077306e-05, + "loss": 2.4685, + "step": 7735 + }, + { + "epoch": 0.8870542368994382, + "grad_norm": 0.24224073494857384, + "learning_rate": 3.308457032083689e-05, + "loss": 2.2408, + "step": 7736 + }, + { + "epoch": 0.8871689026487788, + "grad_norm": 0.23182832364218836, + "learning_rate": 3.301817656298417e-05, + "loss": 2.2865, + "step": 7737 + }, + { + "epoch": 0.8872835683981195, + "grad_norm": 0.2420227910539694, + "learning_rate": 3.295184721637245e-05, + "loss": 2.363, + "step": 7738 + }, + { + "epoch": 0.8873982341474601, + "grad_norm": 0.24758686548660996, + "learning_rate": 3.288558229015054e-05, + "loss": 2.3525, + "step": 7739 + }, + { + "epoch": 0.8875128998968008, + "grad_norm": 0.2739520645162603, + "learning_rate": 3.2819381793458724e-05, + "loss": 2.2985, + "step": 7740 + }, + { + "epoch": 0.8876275656461415, + "grad_norm": 0.2723640966767095, + "learning_rate": 3.275324573542782e-05, + "loss": 2.4885, + "step": 7741 + }, + { + "epoch": 0.8877422313954821, + "grad_norm": 0.22564194151408037, + "learning_rate": 3.268717412518013e-05, + "loss": 2.2813, + "step": 7742 + }, + { + "epoch": 0.8878568971448229, + "grad_norm": 0.23749219986013811, + "learning_rate": 3.262116697182899e-05, + "loss": 2.4247, + "step": 7743 + }, + { + "epoch": 0.8879715628941636, + "grad_norm": 0.23177614459197315, + "learning_rate": 3.255522428447877e-05, + "loss": 2.2875, + "step": 7744 + }, + { + "epoch": 0.8880862286435042, + "grad_norm": 0.2399891268449904, + "learning_rate": 3.248934607222509e-05, + "loss": 2.424, + "step": 7745 + }, + { + "epoch": 0.8882008943928449, + "grad_norm": 0.26651423750304626, + "learning_rate": 3.2423532344154684e-05, + "loss": 2.4469, + "step": 7746 + }, + { + "epoch": 0.8883155601421855, + "grad_norm": 0.25781809204881734, + "learning_rate": 3.2357783109345005e-05, + "loss": 2.4504, + "step": 7747 + }, + { + "epoch": 0.8884302258915262, + "grad_norm": 0.23994898840168424, + "learning_rate": 3.229209837686514e-05, + "loss": 2.3198, + "step": 7748 + }, + { + "epoch": 0.8885448916408669, + "grad_norm": 0.2483453972214529, + "learning_rate": 3.222647815577495e-05, + "loss": 2.3544, + "step": 7749 + }, + { + "epoch": 0.8886595573902075, + "grad_norm": 0.24268611470012, + "learning_rate": 3.2160922455125527e-05, + "loss": 2.4357, + "step": 7750 + }, + { + "epoch": 0.8887742231395482, + "grad_norm": 0.25492869194138024, + "learning_rate": 3.209543128395903e-05, + "loss": 2.4691, + "step": 7751 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.25429335907543055, + "learning_rate": 3.203000465130879e-05, + "loss": 2.3445, + "step": 7752 + }, + { + "epoch": 0.8890035546382296, + "grad_norm": 0.26002230390629805, + "learning_rate": 3.196464256619902e-05, + "loss": 2.3685, + "step": 7753 + }, + { + "epoch": 0.8891182203875703, + "grad_norm": 0.25829146396279, + "learning_rate": 3.189934503764513e-05, + "loss": 2.4632, + "step": 7754 + }, + { + "epoch": 0.8892328861369109, + "grad_norm": 0.24382908594886604, + "learning_rate": 3.1834112074653974e-05, + "loss": 2.3197, + "step": 7755 + }, + { + "epoch": 0.8893475518862516, + "grad_norm": 0.31918875811060793, + "learning_rate": 3.1768943686222906e-05, + "loss": 2.5497, + "step": 7756 + }, + { + "epoch": 0.8894622176355923, + "grad_norm": 0.2760466868618441, + "learning_rate": 3.170383988134079e-05, + "loss": 2.4055, + "step": 7757 + }, + { + "epoch": 0.8895768833849329, + "grad_norm": 0.2620877757542277, + "learning_rate": 3.16388006689875e-05, + "loss": 2.34, + "step": 7758 + }, + { + "epoch": 0.8896915491342736, + "grad_norm": 0.25451936772442024, + "learning_rate": 3.157382605813369e-05, + "loss": 2.335, + "step": 7759 + }, + { + "epoch": 0.8898062148836142, + "grad_norm": 0.253775132044127, + "learning_rate": 3.150891605774164e-05, + "loss": 2.3736, + "step": 7760 + }, + { + "epoch": 0.8899208806329549, + "grad_norm": 0.24521292453800597, + "learning_rate": 3.144407067676447e-05, + "loss": 2.3017, + "step": 7761 + }, + { + "epoch": 0.8900355463822957, + "grad_norm": 0.24252191368121348, + "learning_rate": 3.137928992414618e-05, + "loss": 2.3949, + "step": 7762 + }, + { + "epoch": 0.8901502121316363, + "grad_norm": 0.22779664868167165, + "learning_rate": 3.131457380882213e-05, + "loss": 2.25, + "step": 7763 + }, + { + "epoch": 0.890264877880977, + "grad_norm": 0.2852451473490904, + "learning_rate": 3.1249922339718685e-05, + "loss": 2.3292, + "step": 7764 + }, + { + "epoch": 0.8903795436303176, + "grad_norm": 0.25278597837861333, + "learning_rate": 3.11853355257532e-05, + "loss": 2.4541, + "step": 7765 + }, + { + "epoch": 0.8904942093796583, + "grad_norm": 0.2548503547050677, + "learning_rate": 3.1120813375834276e-05, + "loss": 2.4103, + "step": 7766 + }, + { + "epoch": 0.890608875128999, + "grad_norm": 0.25096820071754866, + "learning_rate": 3.105635589886163e-05, + "loss": 2.3396, + "step": 7767 + }, + { + "epoch": 0.8907235408783396, + "grad_norm": 0.2568472044355167, + "learning_rate": 3.099196310372565e-05, + "loss": 2.2981, + "step": 7768 + }, + { + "epoch": 0.8908382066276803, + "grad_norm": 0.2651767520122004, + "learning_rate": 3.092763499930834e-05, + "loss": 2.398, + "step": 7769 + }, + { + "epoch": 0.890952872377021, + "grad_norm": 0.2476188786851755, + "learning_rate": 3.0863371594482384e-05, + "loss": 2.3607, + "step": 7770 + }, + { + "epoch": 0.8910675381263616, + "grad_norm": 0.2447304870360554, + "learning_rate": 3.07991728981118e-05, + "loss": 2.4151, + "step": 7771 + }, + { + "epoch": 0.8911822038757024, + "grad_norm": 0.25211444122968185, + "learning_rate": 3.07350389190515e-05, + "loss": 2.3941, + "step": 7772 + }, + { + "epoch": 0.891296869625043, + "grad_norm": 0.24209280085054075, + "learning_rate": 3.067096966614763e-05, + "loss": 2.295, + "step": 7773 + }, + { + "epoch": 0.8914115353743837, + "grad_norm": 0.2734923942913027, + "learning_rate": 3.060696514823719e-05, + "loss": 2.3574, + "step": 7774 + }, + { + "epoch": 0.8915262011237244, + "grad_norm": 0.24551510547654865, + "learning_rate": 3.0543025374148545e-05, + "loss": 2.376, + "step": 7775 + }, + { + "epoch": 0.891640866873065, + "grad_norm": 0.25336843376775836, + "learning_rate": 3.0479150352700878e-05, + "loss": 2.4449, + "step": 7776 + }, + { + "epoch": 0.8917555326224057, + "grad_norm": 0.24539614191350503, + "learning_rate": 3.041534009270447e-05, + "loss": 2.4143, + "step": 7777 + }, + { + "epoch": 0.8918701983717464, + "grad_norm": 0.2554545274162413, + "learning_rate": 3.035159460296083e-05, + "loss": 2.4106, + "step": 7778 + }, + { + "epoch": 0.891984864121087, + "grad_norm": 0.2642047935547276, + "learning_rate": 3.028791389226243e-05, + "loss": 2.349, + "step": 7779 + }, + { + "epoch": 0.8920995298704277, + "grad_norm": 0.26632321175877466, + "learning_rate": 3.0224297969392745e-05, + "loss": 2.3195, + "step": 7780 + }, + { + "epoch": 0.8922141956197683, + "grad_norm": 0.24374035582879336, + "learning_rate": 3.0160746843126418e-05, + "loss": 2.4381, + "step": 7781 + }, + { + "epoch": 0.892328861369109, + "grad_norm": 0.24872889523238803, + "learning_rate": 3.009726052222922e-05, + "loss": 2.4771, + "step": 7782 + }, + { + "epoch": 0.8924435271184498, + "grad_norm": 0.2618677127474087, + "learning_rate": 3.0033839015457644e-05, + "loss": 2.3349, + "step": 7783 + }, + { + "epoch": 0.8925581928677904, + "grad_norm": 0.2692763630918073, + "learning_rate": 2.9970482331559634e-05, + "loss": 2.3675, + "step": 7784 + }, + { + "epoch": 0.8926728586171311, + "grad_norm": 0.25206787187291285, + "learning_rate": 2.9907190479274037e-05, + "loss": 2.2911, + "step": 7785 + }, + { + "epoch": 0.8927875243664717, + "grad_norm": 0.23469985213610883, + "learning_rate": 2.9843963467330703e-05, + "loss": 2.303, + "step": 7786 + }, + { + "epoch": 0.8929021901158124, + "grad_norm": 0.25640932846071607, + "learning_rate": 2.97808013044506e-05, + "loss": 2.3133, + "step": 7787 + }, + { + "epoch": 0.8930168558651531, + "grad_norm": 0.295640791986467, + "learning_rate": 2.971770399934587e-05, + "loss": 2.3341, + "step": 7788 + }, + { + "epoch": 0.8931315216144937, + "grad_norm": 0.2551169985318455, + "learning_rate": 2.965467156071927e-05, + "loss": 2.3173, + "step": 7789 + }, + { + "epoch": 0.8932461873638344, + "grad_norm": 0.2494333370925394, + "learning_rate": 2.959170399726535e-05, + "loss": 2.4005, + "step": 7790 + }, + { + "epoch": 0.8933608531131751, + "grad_norm": 0.264343855895208, + "learning_rate": 2.952880131766894e-05, + "loss": 2.43, + "step": 7791 + }, + { + "epoch": 0.8934755188625157, + "grad_norm": 0.23538877237660957, + "learning_rate": 2.9465963530606433e-05, + "loss": 2.2349, + "step": 7792 + }, + { + "epoch": 0.8935901846118565, + "grad_norm": 0.26909144087688414, + "learning_rate": 2.940319064474506e-05, + "loss": 2.333, + "step": 7793 + }, + { + "epoch": 0.8937048503611971, + "grad_norm": 0.27293954154322986, + "learning_rate": 2.934048266874312e-05, + "loss": 2.3535, + "step": 7794 + }, + { + "epoch": 0.8938195161105378, + "grad_norm": 0.23756380641305277, + "learning_rate": 2.9277839611250023e-05, + "loss": 2.3367, + "step": 7795 + }, + { + "epoch": 0.8939341818598785, + "grad_norm": 0.2756024912949463, + "learning_rate": 2.9215261480906254e-05, + "loss": 2.4312, + "step": 7796 + }, + { + "epoch": 0.8940488476092191, + "grad_norm": 0.2456284058755781, + "learning_rate": 2.9152748286343067e-05, + "loss": 2.5261, + "step": 7797 + }, + { + "epoch": 0.8941635133585598, + "grad_norm": 0.2854948485110286, + "learning_rate": 2.9090300036183125e-05, + "loss": 2.313, + "step": 7798 + }, + { + "epoch": 0.8942781791079004, + "grad_norm": 0.2632978746553978, + "learning_rate": 2.9027916739039927e-05, + "loss": 2.3334, + "step": 7799 + }, + { + "epoch": 0.8943928448572411, + "grad_norm": 0.25831788742131717, + "learning_rate": 2.896559840351809e-05, + "loss": 2.3981, + "step": 7800 + }, + { + "epoch": 0.8945075106065818, + "grad_norm": 0.2675474456749712, + "learning_rate": 2.8903345038213235e-05, + "loss": 2.4568, + "step": 7801 + }, + { + "epoch": 0.8946221763559224, + "grad_norm": 0.2531222299845237, + "learning_rate": 2.8841156651712053e-05, + "loss": 2.3829, + "step": 7802 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2628036133565752, + "learning_rate": 2.8779033252592123e-05, + "loss": 2.3268, + "step": 7803 + }, + { + "epoch": 0.8948515078546039, + "grad_norm": 0.2349018779280865, + "learning_rate": 2.8716974849422263e-05, + "loss": 2.6534, + "step": 7804 + }, + { + "epoch": 0.8949661736039445, + "grad_norm": 0.27115289899851386, + "learning_rate": 2.865498145076234e-05, + "loss": 2.3735, + "step": 7805 + }, + { + "epoch": 0.8950808393532852, + "grad_norm": 0.24908753717443133, + "learning_rate": 2.8593053065162967e-05, + "loss": 2.3863, + "step": 7806 + }, + { + "epoch": 0.8951955051026258, + "grad_norm": 0.2698628326205673, + "learning_rate": 2.8531189701166137e-05, + "loss": 2.4338, + "step": 7807 + }, + { + "epoch": 0.8953101708519665, + "grad_norm": 0.2725051495545839, + "learning_rate": 2.8469391367304753e-05, + "loss": 2.4051, + "step": 7808 + }, + { + "epoch": 0.8954248366013072, + "grad_norm": 0.24550115934780878, + "learning_rate": 2.8407658072102495e-05, + "loss": 2.3766, + "step": 7809 + }, + { + "epoch": 0.8955395023506478, + "grad_norm": 0.25255966614504294, + "learning_rate": 2.834598982407449e-05, + "loss": 2.322, + "step": 7810 + }, + { + "epoch": 0.8956541680999885, + "grad_norm": 0.2744654425684799, + "learning_rate": 2.828438663172672e-05, + "loss": 2.3363, + "step": 7811 + }, + { + "epoch": 0.8957688338493293, + "grad_norm": 0.24750221878780485, + "learning_rate": 2.8222848503556054e-05, + "loss": 2.2631, + "step": 7812 + }, + { + "epoch": 0.8958834995986699, + "grad_norm": 0.2527257287176193, + "learning_rate": 2.8161375448050474e-05, + "loss": 2.6212, + "step": 7813 + }, + { + "epoch": 0.8959981653480106, + "grad_norm": 0.2565443796704807, + "learning_rate": 2.8099967473689147e-05, + "loss": 2.4679, + "step": 7814 + }, + { + "epoch": 0.8961128310973512, + "grad_norm": 0.2773641159016645, + "learning_rate": 2.8038624588942074e-05, + "loss": 2.3576, + "step": 7815 + }, + { + "epoch": 0.8962274968466919, + "grad_norm": 0.2537068143969078, + "learning_rate": 2.7977346802270322e-05, + "loss": 2.3317, + "step": 7816 + }, + { + "epoch": 0.8963421625960326, + "grad_norm": 0.2636175611266541, + "learning_rate": 2.791613412212607e-05, + "loss": 2.3346, + "step": 7817 + }, + { + "epoch": 0.8964568283453732, + "grad_norm": 0.294148898208544, + "learning_rate": 2.7854986556952344e-05, + "loss": 2.308, + "step": 7818 + }, + { + "epoch": 0.8965714940947139, + "grad_norm": 0.25534715728180185, + "learning_rate": 2.7793904115183287e-05, + "loss": 2.4505, + "step": 7819 + }, + { + "epoch": 0.8966861598440545, + "grad_norm": 0.222332000673108, + "learning_rate": 2.7732886805244108e-05, + "loss": 2.3406, + "step": 7820 + }, + { + "epoch": 0.8968008255933952, + "grad_norm": 0.2596438436912729, + "learning_rate": 2.7671934635550956e-05, + "loss": 2.4462, + "step": 7821 + }, + { + "epoch": 0.896915491342736, + "grad_norm": 0.2464858570761382, + "learning_rate": 2.7611047614511052e-05, + "loss": 2.4006, + "step": 7822 + }, + { + "epoch": 0.8970301570920766, + "grad_norm": 0.23735821048363162, + "learning_rate": 2.7550225750522627e-05, + "loss": 2.3803, + "step": 7823 + }, + { + "epoch": 0.8971448228414173, + "grad_norm": 0.25752105411774967, + "learning_rate": 2.748946905197469e-05, + "loss": 2.3885, + "step": 7824 + }, + { + "epoch": 0.897259488590758, + "grad_norm": 0.271135448834253, + "learning_rate": 2.7428777527247707e-05, + "loss": 2.3636, + "step": 7825 + }, + { + "epoch": 0.8973741543400986, + "grad_norm": 0.28538045693773134, + "learning_rate": 2.7368151184712918e-05, + "loss": 2.3036, + "step": 7826 + }, + { + "epoch": 0.8974888200894393, + "grad_norm": 0.2793228120527593, + "learning_rate": 2.730759003273242e-05, + "loss": 2.3208, + "step": 7827 + }, + { + "epoch": 0.8976034858387799, + "grad_norm": 0.2406438885695902, + "learning_rate": 2.7247094079659528e-05, + "loss": 2.4537, + "step": 7828 + }, + { + "epoch": 0.8977181515881206, + "grad_norm": 0.25972341250979636, + "learning_rate": 2.7186663333838514e-05, + "loss": 2.4673, + "step": 7829 + }, + { + "epoch": 0.8978328173374613, + "grad_norm": 0.2959067621531113, + "learning_rate": 2.712629780360465e-05, + "loss": 2.3499, + "step": 7830 + }, + { + "epoch": 0.897947483086802, + "grad_norm": 0.24667301600462077, + "learning_rate": 2.7065997497284224e-05, + "loss": 2.5194, + "step": 7831 + }, + { + "epoch": 0.8980621488361427, + "grad_norm": 0.24290724284194581, + "learning_rate": 2.700576242319458e-05, + "loss": 2.4992, + "step": 7832 + }, + { + "epoch": 0.8981768145854833, + "grad_norm": 0.2514735373448667, + "learning_rate": 2.6945592589643853e-05, + "loss": 2.4853, + "step": 7833 + }, + { + "epoch": 0.898291480334824, + "grad_norm": 0.2858408348601675, + "learning_rate": 2.6885488004931347e-05, + "loss": 2.3865, + "step": 7834 + }, + { + "epoch": 0.8984061460841647, + "grad_norm": 0.26390622936852715, + "learning_rate": 2.682544867734743e-05, + "loss": 2.3246, + "step": 7835 + }, + { + "epoch": 0.8985208118335053, + "grad_norm": 0.26803992807521726, + "learning_rate": 2.6765474615173313e-05, + "loss": 2.4616, + "step": 7836 + }, + { + "epoch": 0.898635477582846, + "grad_norm": 0.2752464542503413, + "learning_rate": 2.6705565826681322e-05, + "loss": 2.3671, + "step": 7837 + }, + { + "epoch": 0.8987501433321867, + "grad_norm": 0.2659857670056342, + "learning_rate": 2.664572232013479e-05, + "loss": 2.4742, + "step": 7838 + }, + { + "epoch": 0.8988648090815273, + "grad_norm": 0.27514635555858746, + "learning_rate": 2.658594410378773e-05, + "loss": 2.2781, + "step": 7839 + }, + { + "epoch": 0.898979474830868, + "grad_norm": 0.25802708356998977, + "learning_rate": 2.6526231185885763e-05, + "loss": 2.5159, + "step": 7840 + }, + { + "epoch": 0.8990941405802086, + "grad_norm": 0.2721796391535052, + "learning_rate": 2.6466583574664914e-05, + "loss": 2.352, + "step": 7841 + }, + { + "epoch": 0.8992088063295494, + "grad_norm": 0.26137860304294547, + "learning_rate": 2.6407001278352495e-05, + "loss": 2.4054, + "step": 7842 + }, + { + "epoch": 0.8993234720788901, + "grad_norm": 0.2586948705123628, + "learning_rate": 2.6347484305166704e-05, + "loss": 2.4599, + "step": 7843 + }, + { + "epoch": 0.8994381378282307, + "grad_norm": 0.24508466218311736, + "learning_rate": 2.628803266331686e-05, + "loss": 2.3333, + "step": 7844 + }, + { + "epoch": 0.8995528035775714, + "grad_norm": 0.266850580034854, + "learning_rate": 2.6228646361003085e-05, + "loss": 2.4701, + "step": 7845 + }, + { + "epoch": 0.8996674693269121, + "grad_norm": 0.2864173278943525, + "learning_rate": 2.6169325406416754e-05, + "loss": 2.3649, + "step": 7846 + }, + { + "epoch": 0.8997821350762527, + "grad_norm": 0.25437292815266915, + "learning_rate": 2.6110069807739832e-05, + "loss": 2.2711, + "step": 7847 + }, + { + "epoch": 0.8998968008255934, + "grad_norm": 0.2541625261546507, + "learning_rate": 2.6050879573145668e-05, + "loss": 2.5051, + "step": 7848 + }, + { + "epoch": 0.900011466574934, + "grad_norm": 0.2858908934118265, + "learning_rate": 2.5991754710798345e-05, + "loss": 2.3956, + "step": 7849 + }, + { + "epoch": 0.9001261323242747, + "grad_norm": 0.25169167331504866, + "learning_rate": 2.5932695228853055e-05, + "loss": 2.4216, + "step": 7850 + }, + { + "epoch": 0.9002407980736155, + "grad_norm": 0.2472825923975517, + "learning_rate": 2.5873701135455952e-05, + "loss": 2.4799, + "step": 7851 + }, + { + "epoch": 0.9003554638229561, + "grad_norm": 0.2633651096181669, + "learning_rate": 2.581477243874414e-05, + "loss": 2.3638, + "step": 7852 + }, + { + "epoch": 0.9004701295722968, + "grad_norm": 0.26599684658992595, + "learning_rate": 2.575590914684556e-05, + "loss": 2.3738, + "step": 7853 + }, + { + "epoch": 0.9005847953216374, + "grad_norm": 0.2851127565467227, + "learning_rate": 2.569711126787938e-05, + "loss": 2.4166, + "step": 7854 + }, + { + "epoch": 0.9006994610709781, + "grad_norm": 0.2485678713731354, + "learning_rate": 2.5638378809955786e-05, + "loss": 2.4474, + "step": 7855 + }, + { + "epoch": 0.9008141268203188, + "grad_norm": 0.25608213589692336, + "learning_rate": 2.557971178117563e-05, + "loss": 2.3716, + "step": 7856 + }, + { + "epoch": 0.9009287925696594, + "grad_norm": 0.23168367018671582, + "learning_rate": 2.5521110189630937e-05, + "loss": 2.3818, + "step": 7857 + }, + { + "epoch": 0.9010434583190001, + "grad_norm": 0.265882243984455, + "learning_rate": 2.5462574043404795e-05, + "loss": 2.4257, + "step": 7858 + }, + { + "epoch": 0.9011581240683408, + "grad_norm": 0.2636971340212292, + "learning_rate": 2.540410335057092e-05, + "loss": 2.4915, + "step": 7859 + }, + { + "epoch": 0.9012727898176814, + "grad_norm": 0.24036791651609013, + "learning_rate": 2.5345698119194403e-05, + "loss": 2.2715, + "step": 7860 + }, + { + "epoch": 0.9013874555670222, + "grad_norm": 0.25682477392528935, + "learning_rate": 2.5287358357331203e-05, + "loss": 2.4662, + "step": 7861 + }, + { + "epoch": 0.9015021213163628, + "grad_norm": 0.24387437086591005, + "learning_rate": 2.5229084073027987e-05, + "loss": 2.4367, + "step": 7862 + }, + { + "epoch": 0.9016167870657035, + "grad_norm": 0.2512676590586209, + "learning_rate": 2.5170875274322657e-05, + "loss": 2.3182, + "step": 7863 + }, + { + "epoch": 0.9017314528150442, + "grad_norm": 0.2556134895644676, + "learning_rate": 2.5112731969244073e-05, + "loss": 2.3356, + "step": 7864 + }, + { + "epoch": 0.9018461185643848, + "grad_norm": 0.2695815921155733, + "learning_rate": 2.5054654165811873e-05, + "loss": 2.3602, + "step": 7865 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 0.28399502911945773, + "learning_rate": 2.4996641872036875e-05, + "loss": 2.3925, + "step": 7866 + }, + { + "epoch": 0.9020754500630661, + "grad_norm": 0.23414169920940725, + "learning_rate": 2.493869509592078e-05, + "loss": 2.2545, + "step": 7867 + }, + { + "epoch": 0.9021901158124068, + "grad_norm": 0.24146780653946495, + "learning_rate": 2.488081384545621e-05, + "loss": 2.3091, + "step": 7868 + }, + { + "epoch": 0.9023047815617475, + "grad_norm": 0.2538515271390083, + "learning_rate": 2.482299812862665e-05, + "loss": 2.4604, + "step": 7869 + }, + { + "epoch": 0.9024194473110881, + "grad_norm": 0.24208003527611488, + "learning_rate": 2.47652479534069e-05, + "loss": 2.389, + "step": 7870 + }, + { + "epoch": 0.9025341130604289, + "grad_norm": 0.25496550477094687, + "learning_rate": 2.4707563327762417e-05, + "loss": 2.2991, + "step": 7871 + }, + { + "epoch": 0.9026487788097696, + "grad_norm": 0.2672172439689978, + "learning_rate": 2.4649944259649615e-05, + "loss": 2.5429, + "step": 7872 + }, + { + "epoch": 0.9027634445591102, + "grad_norm": 0.2652537669432156, + "learning_rate": 2.459239075701608e-05, + "loss": 2.4249, + "step": 7873 + }, + { + "epoch": 0.9028781103084509, + "grad_norm": 0.25933928278776125, + "learning_rate": 2.453490282780002e-05, + "loss": 2.3148, + "step": 7874 + }, + { + "epoch": 0.9029927760577915, + "grad_norm": 0.26039896820845754, + "learning_rate": 2.447748047993098e-05, + "loss": 2.2465, + "step": 7875 + }, + { + "epoch": 0.9031074418071322, + "grad_norm": 0.24749361630700203, + "learning_rate": 2.4420123721329344e-05, + "loss": 2.3158, + "step": 7876 + }, + { + "epoch": 0.9032221075564729, + "grad_norm": 0.2637399071512931, + "learning_rate": 2.4362832559906123e-05, + "loss": 2.4934, + "step": 7877 + }, + { + "epoch": 0.9033367733058135, + "grad_norm": 0.2515447449055186, + "learning_rate": 2.4305607003563714e-05, + "loss": 2.4473, + "step": 7878 + }, + { + "epoch": 0.9034514390551542, + "grad_norm": 0.291058701300032, + "learning_rate": 2.42484470601953e-05, + "loss": 2.2977, + "step": 7879 + }, + { + "epoch": 0.903566104804495, + "grad_norm": 0.25700278369804813, + "learning_rate": 2.4191352737684913e-05, + "loss": 2.2886, + "step": 7880 + }, + { + "epoch": 0.9036807705538356, + "grad_norm": 0.2469321386600971, + "learning_rate": 2.4134324043907697e-05, + "loss": 2.4029, + "step": 7881 + }, + { + "epoch": 0.9037954363031763, + "grad_norm": 0.2531502721088116, + "learning_rate": 2.407736098672969e-05, + "loss": 2.4438, + "step": 7882 + }, + { + "epoch": 0.9039101020525169, + "grad_norm": 0.2456453380400903, + "learning_rate": 2.4020463574007777e-05, + "loss": 2.393, + "step": 7883 + }, + { + "epoch": 0.9040247678018576, + "grad_norm": 0.23817744241130204, + "learning_rate": 2.396363181358996e-05, + "loss": 2.2695, + "step": 7884 + }, + { + "epoch": 0.9041394335511983, + "grad_norm": 0.2853802066100375, + "learning_rate": 2.3906865713315018e-05, + "loss": 2.3366, + "step": 7885 + }, + { + "epoch": 0.9042540993005389, + "grad_norm": 0.25068855741011037, + "learning_rate": 2.3850165281012747e-05, + "loss": 2.4639, + "step": 7886 + }, + { + "epoch": 0.9043687650498796, + "grad_norm": 0.25346109375597387, + "learning_rate": 2.3793530524504004e-05, + "loss": 2.4021, + "step": 7887 + }, + { + "epoch": 0.9044834307992202, + "grad_norm": 0.2414374657536262, + "learning_rate": 2.3736961451600423e-05, + "loss": 2.1887, + "step": 7888 + }, + { + "epoch": 0.9045980965485609, + "grad_norm": 0.25446171707434806, + "learning_rate": 2.3680458070104492e-05, + "loss": 2.3651, + "step": 7889 + }, + { + "epoch": 0.9047127622979017, + "grad_norm": 0.23584363518947468, + "learning_rate": 2.362402038781003e-05, + "loss": 2.2678, + "step": 7890 + }, + { + "epoch": 0.9048274280472423, + "grad_norm": 0.24127462723579762, + "learning_rate": 2.3567648412501365e-05, + "loss": 2.3628, + "step": 7891 + }, + { + "epoch": 0.904942093796583, + "grad_norm": 0.24126346027555615, + "learning_rate": 2.351134215195394e-05, + "loss": 2.4828, + "step": 7892 + }, + { + "epoch": 0.9050567595459237, + "grad_norm": 0.27217730333184964, + "learning_rate": 2.345510161393416e-05, + "loss": 2.4645, + "step": 7893 + }, + { + "epoch": 0.9051714252952643, + "grad_norm": 0.2723157247316288, + "learning_rate": 2.339892680619937e-05, + "loss": 2.4504, + "step": 7894 + }, + { + "epoch": 0.905286091044605, + "grad_norm": 0.2525789596314939, + "learning_rate": 2.334281773649777e-05, + "loss": 2.4053, + "step": 7895 + }, + { + "epoch": 0.9054007567939456, + "grad_norm": 0.2532463916943479, + "learning_rate": 2.32867744125686e-05, + "loss": 2.4811, + "step": 7896 + }, + { + "epoch": 0.9055154225432863, + "grad_norm": 0.26091843189723296, + "learning_rate": 2.323079684214191e-05, + "loss": 2.4983, + "step": 7897 + }, + { + "epoch": 0.905630088292627, + "grad_norm": 0.24144753077020223, + "learning_rate": 2.3174885032938685e-05, + "loss": 2.4569, + "step": 7898 + }, + { + "epoch": 0.9057447540419676, + "grad_norm": 0.23577034498701382, + "learning_rate": 2.3119038992670983e-05, + "loss": 2.3982, + "step": 7899 + }, + { + "epoch": 0.9058594197913084, + "grad_norm": 0.2458641648577192, + "learning_rate": 2.3063258729041635e-05, + "loss": 2.3224, + "step": 7900 + }, + { + "epoch": 0.905974085540649, + "grad_norm": 0.2550707171865382, + "learning_rate": 2.300754424974455e-05, + "loss": 2.3976, + "step": 7901 + }, + { + "epoch": 0.9060887512899897, + "grad_norm": 0.27940292289825736, + "learning_rate": 2.295189556246452e-05, + "loss": 2.5029, + "step": 7902 + }, + { + "epoch": 0.9062034170393304, + "grad_norm": 0.2758627350006131, + "learning_rate": 2.2896312674877017e-05, + "loss": 2.3768, + "step": 7903 + }, + { + "epoch": 0.906318082788671, + "grad_norm": 0.2501087405194425, + "learning_rate": 2.284079559464869e-05, + "loss": 2.3264, + "step": 7904 + }, + { + "epoch": 0.9064327485380117, + "grad_norm": 0.2526346671421727, + "learning_rate": 2.2785344329437297e-05, + "loss": 2.5465, + "step": 7905 + }, + { + "epoch": 0.9065474142873524, + "grad_norm": 0.24413443907852095, + "learning_rate": 2.2729958886891e-05, + "loss": 2.3026, + "step": 7906 + }, + { + "epoch": 0.906662080036693, + "grad_norm": 0.25158936353853034, + "learning_rate": 2.2674639274649357e-05, + "loss": 2.4795, + "step": 7907 + }, + { + "epoch": 0.9067767457860337, + "grad_norm": 0.24059036981887977, + "learning_rate": 2.2619385500342594e-05, + "loss": 2.4297, + "step": 7908 + }, + { + "epoch": 0.9068914115353743, + "grad_norm": 0.2635798576820344, + "learning_rate": 2.2564197571591726e-05, + "loss": 2.4817, + "step": 7909 + }, + { + "epoch": 0.907006077284715, + "grad_norm": 0.2633408375649657, + "learning_rate": 2.2509075496009168e-05, + "loss": 2.3852, + "step": 7910 + }, + { + "epoch": 0.9071207430340558, + "grad_norm": 0.2675663086237945, + "learning_rate": 2.245401928119789e-05, + "loss": 2.3659, + "step": 7911 + }, + { + "epoch": 0.9072354087833964, + "grad_norm": 0.26898841794311357, + "learning_rate": 2.2399028934751764e-05, + "loss": 2.3604, + "step": 7912 + }, + { + "epoch": 0.9073500745327371, + "grad_norm": 0.24553393301062404, + "learning_rate": 2.2344104464255664e-05, + "loss": 2.391, + "step": 7913 + }, + { + "epoch": 0.9074647402820778, + "grad_norm": 0.2690125686843234, + "learning_rate": 2.228924587728537e-05, + "loss": 2.1671, + "step": 7914 + }, + { + "epoch": 0.9075794060314184, + "grad_norm": 0.25391168996228936, + "learning_rate": 2.2234453181407654e-05, + "loss": 2.3694, + "step": 7915 + }, + { + "epoch": 0.9076940717807591, + "grad_norm": 0.2654551987482526, + "learning_rate": 2.217972638418009e-05, + "loss": 2.1761, + "step": 7916 + }, + { + "epoch": 0.9078087375300997, + "grad_norm": 0.27135612053122643, + "learning_rate": 2.2125065493151242e-05, + "loss": 2.2833, + "step": 7917 + }, + { + "epoch": 0.9079234032794404, + "grad_norm": 0.24079487125715343, + "learning_rate": 2.207047051586042e-05, + "loss": 2.3862, + "step": 7918 + }, + { + "epoch": 0.9080380690287811, + "grad_norm": 0.24838401386026915, + "learning_rate": 2.201594145983793e-05, + "loss": 2.4133, + "step": 7919 + }, + { + "epoch": 0.9081527347781218, + "grad_norm": 0.2527087426914141, + "learning_rate": 2.196147833260531e-05, + "loss": 2.2708, + "step": 7920 + }, + { + "epoch": 0.9082674005274625, + "grad_norm": 0.25189706427685826, + "learning_rate": 2.190708114167439e-05, + "loss": 2.2798, + "step": 7921 + }, + { + "epoch": 0.9083820662768031, + "grad_norm": 0.23870274237895803, + "learning_rate": 2.185274989454833e-05, + "loss": 2.3957, + "step": 7922 + }, + { + "epoch": 0.9084967320261438, + "grad_norm": 0.26146361575832205, + "learning_rate": 2.1798484598721245e-05, + "loss": 2.4097, + "step": 7923 + }, + { + "epoch": 0.9086113977754845, + "grad_norm": 0.2577300202943111, + "learning_rate": 2.1744285261677653e-05, + "loss": 2.3238, + "step": 7924 + }, + { + "epoch": 0.9087260635248251, + "grad_norm": 0.23911959575200273, + "learning_rate": 2.1690151890893628e-05, + "loss": 2.3646, + "step": 7925 + }, + { + "epoch": 0.9088407292741658, + "grad_norm": 0.2484690091750989, + "learning_rate": 2.1636084493835804e-05, + "loss": 2.5228, + "step": 7926 + }, + { + "epoch": 0.9089553950235065, + "grad_norm": 0.27500206040896635, + "learning_rate": 2.1582083077961612e-05, + "loss": 2.3183, + "step": 7927 + }, + { + "epoch": 0.9090700607728471, + "grad_norm": 0.2608313947945333, + "learning_rate": 2.1528147650719533e-05, + "loss": 2.3748, + "step": 7928 + }, + { + "epoch": 0.9091847265221878, + "grad_norm": 0.2568595556171314, + "learning_rate": 2.1474278219549058e-05, + "loss": 2.3894, + "step": 7929 + }, + { + "epoch": 0.9092993922715285, + "grad_norm": 0.24138136489043635, + "learning_rate": 2.1420474791880306e-05, + "loss": 2.3941, + "step": 7930 + }, + { + "epoch": 0.9094140580208692, + "grad_norm": 0.2398758345396527, + "learning_rate": 2.136673737513456e-05, + "loss": 2.3627, + "step": 7931 + }, + { + "epoch": 0.9095287237702099, + "grad_norm": 0.25842059196958517, + "learning_rate": 2.1313065976723834e-05, + "loss": 2.531, + "step": 7932 + }, + { + "epoch": 0.9096433895195505, + "grad_norm": 0.2545696850414797, + "learning_rate": 2.125946060405104e-05, + "loss": 2.2547, + "step": 7933 + }, + { + "epoch": 0.9097580552688912, + "grad_norm": 0.2664535756038087, + "learning_rate": 2.1205921264509986e-05, + "loss": 2.32, + "step": 7934 + }, + { + "epoch": 0.9098727210182318, + "grad_norm": 0.24092546936507106, + "learning_rate": 2.1152447965485432e-05, + "loss": 2.3564, + "step": 7935 + }, + { + "epoch": 0.9099873867675725, + "grad_norm": 0.26941517866280684, + "learning_rate": 2.109904071435309e-05, + "loss": 2.3969, + "step": 7936 + }, + { + "epoch": 0.9101020525169132, + "grad_norm": 0.2906635670090949, + "learning_rate": 2.1045699518479345e-05, + "loss": 2.3913, + "step": 7937 + }, + { + "epoch": 0.9102167182662538, + "grad_norm": 0.2648506540808758, + "learning_rate": 2.09924243852217e-05, + "loss": 2.4178, + "step": 7938 + }, + { + "epoch": 0.9103313840155945, + "grad_norm": 0.25568113775986445, + "learning_rate": 2.0939215321928328e-05, + "loss": 2.4078, + "step": 7939 + }, + { + "epoch": 0.9104460497649353, + "grad_norm": 0.2360733093806897, + "learning_rate": 2.0886072335938532e-05, + "loss": 2.3216, + "step": 7940 + }, + { + "epoch": 0.9105607155142759, + "grad_norm": 0.23834070098452093, + "learning_rate": 2.083299543458228e-05, + "loss": 2.4079, + "step": 7941 + }, + { + "epoch": 0.9106753812636166, + "grad_norm": 0.25228416042551843, + "learning_rate": 2.0779984625180548e-05, + "loss": 2.2367, + "step": 7942 + }, + { + "epoch": 0.9107900470129572, + "grad_norm": 0.2521735271398086, + "learning_rate": 2.072703991504521e-05, + "loss": 2.525, + "step": 7943 + }, + { + "epoch": 0.9109047127622979, + "grad_norm": 0.2534719502817023, + "learning_rate": 2.0674161311478924e-05, + "loss": 2.416, + "step": 7944 + }, + { + "epoch": 0.9110193785116386, + "grad_norm": 0.28122559249600215, + "learning_rate": 2.06213488217753e-05, + "loss": 2.4074, + "step": 7945 + }, + { + "epoch": 0.9111340442609792, + "grad_norm": 0.25621858258613533, + "learning_rate": 2.0568602453218843e-05, + "loss": 2.347, + "step": 7946 + }, + { + "epoch": 0.9112487100103199, + "grad_norm": 0.26564576016287866, + "learning_rate": 2.0515922213084848e-05, + "loss": 2.3198, + "step": 7947 + }, + { + "epoch": 0.9113633757596606, + "grad_norm": 0.2728719353488285, + "learning_rate": 2.0463308108639612e-05, + "loss": 2.4589, + "step": 7948 + }, + { + "epoch": 0.9114780415090012, + "grad_norm": 0.2481026541444845, + "learning_rate": 2.0410760147140162e-05, + "loss": 2.4044, + "step": 7949 + }, + { + "epoch": 0.911592707258342, + "grad_norm": 0.25040294688921416, + "learning_rate": 2.035827833583459e-05, + "loss": 2.5385, + "step": 7950 + }, + { + "epoch": 0.9117073730076826, + "grad_norm": 0.27304643825929703, + "learning_rate": 2.0305862681961717e-05, + "loss": 2.3287, + "step": 7951 + }, + { + "epoch": 0.9118220387570233, + "grad_norm": 0.27729707874279247, + "learning_rate": 2.025351319275137e-05, + "loss": 2.3632, + "step": 7952 + }, + { + "epoch": 0.911936704506364, + "grad_norm": 0.2487220078848279, + "learning_rate": 2.0201229875424e-05, + "loss": 2.4274, + "step": 7953 + }, + { + "epoch": 0.9120513702557046, + "grad_norm": 0.269903611618632, + "learning_rate": 2.0149012737191052e-05, + "loss": 2.4517, + "step": 7954 + }, + { + "epoch": 0.9121660360050453, + "grad_norm": 0.2544392297092218, + "learning_rate": 2.0096861785255217e-05, + "loss": 2.3716, + "step": 7955 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.2756260067261441, + "learning_rate": 2.00447770268094e-05, + "loss": 2.4198, + "step": 7956 + }, + { + "epoch": 0.9123953675037266, + "grad_norm": 0.27686351547989557, + "learning_rate": 1.9992758469037808e-05, + "loss": 2.4203, + "step": 7957 + }, + { + "epoch": 0.9125100332530673, + "grad_norm": 0.23229343546216494, + "learning_rate": 1.994080611911553e-05, + "loss": 2.1795, + "step": 7958 + }, + { + "epoch": 0.912624699002408, + "grad_norm": 0.2793079021045908, + "learning_rate": 1.988891998420811e-05, + "loss": 2.288, + "step": 7959 + }, + { + "epoch": 0.9127393647517487, + "grad_norm": 0.2477835520672039, + "learning_rate": 1.983710007147249e-05, + "loss": 2.4896, + "step": 7960 + }, + { + "epoch": 0.9128540305010894, + "grad_norm": 0.2761137058296533, + "learning_rate": 1.9785346388056236e-05, + "loss": 2.5479, + "step": 7961 + }, + { + "epoch": 0.91296869625043, + "grad_norm": 0.28408936240485505, + "learning_rate": 1.9733658941097686e-05, + "loss": 2.4743, + "step": 7962 + }, + { + "epoch": 0.9130833619997707, + "grad_norm": 0.243227350656218, + "learning_rate": 1.96820377377262e-05, + "loss": 2.3633, + "step": 7963 + }, + { + "epoch": 0.9131980277491113, + "grad_norm": 0.24971888746080473, + "learning_rate": 1.9630482785061854e-05, + "loss": 2.4409, + "step": 7964 + }, + { + "epoch": 0.913312693498452, + "grad_norm": 0.27381885216961716, + "learning_rate": 1.9578994090215796e-05, + "loss": 2.3039, + "step": 7965 + }, + { + "epoch": 0.9134273592477927, + "grad_norm": 0.2893100878612547, + "learning_rate": 1.952757166028979e-05, + "loss": 2.4132, + "step": 7966 + }, + { + "epoch": 0.9135420249971333, + "grad_norm": 0.26426162992338725, + "learning_rate": 1.9476215502376714e-05, + "loss": 2.3879, + "step": 7967 + }, + { + "epoch": 0.913656690746474, + "grad_norm": 0.2750488912030977, + "learning_rate": 1.9424925623560075e-05, + "loss": 2.3436, + "step": 7968 + }, + { + "epoch": 0.9137713564958146, + "grad_norm": 0.26550105223081966, + "learning_rate": 1.9373702030914208e-05, + "loss": 2.3745, + "step": 7969 + }, + { + "epoch": 0.9138860222451554, + "grad_norm": 0.2544003262522658, + "learning_rate": 1.932254473150474e-05, + "loss": 2.3693, + "step": 7970 + }, + { + "epoch": 0.9140006879944961, + "grad_norm": 0.22774973269781684, + "learning_rate": 1.9271453732387646e-05, + "loss": 2.4314, + "step": 7971 + }, + { + "epoch": 0.9141153537438367, + "grad_norm": 0.2433306057065735, + "learning_rate": 1.9220429040609943e-05, + "loss": 2.4567, + "step": 7972 + }, + { + "epoch": 0.9142300194931774, + "grad_norm": 0.2591787343448949, + "learning_rate": 1.9169470663209675e-05, + "loss": 2.5235, + "step": 7973 + }, + { + "epoch": 0.9143446852425181, + "grad_norm": 0.25497552405908674, + "learning_rate": 1.911857860721533e-05, + "loss": 2.4218, + "step": 7974 + }, + { + "epoch": 0.9144593509918587, + "grad_norm": 0.26170914019516583, + "learning_rate": 1.9067752879646683e-05, + "loss": 2.3525, + "step": 7975 + }, + { + "epoch": 0.9145740167411994, + "grad_norm": 0.2710807091619628, + "learning_rate": 1.901699348751418e-05, + "loss": 2.3217, + "step": 7976 + }, + { + "epoch": 0.91468868249054, + "grad_norm": 0.26184255260961053, + "learning_rate": 1.896630043781894e-05, + "loss": 2.5225, + "step": 7977 + }, + { + "epoch": 0.9148033482398807, + "grad_norm": 0.27548298724178183, + "learning_rate": 1.8915673737553317e-05, + "loss": 2.524, + "step": 7978 + }, + { + "epoch": 0.9149180139892215, + "grad_norm": 0.26156981036334437, + "learning_rate": 1.8865113393700117e-05, + "loss": 2.3514, + "step": 7979 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.2686840361718828, + "learning_rate": 1.8814619413233314e-05, + "loss": 2.3751, + "step": 7980 + }, + { + "epoch": 0.9151473454879028, + "grad_norm": 0.2515356834960261, + "learning_rate": 1.8764191803117502e-05, + "loss": 2.4604, + "step": 7981 + }, + { + "epoch": 0.9152620112372435, + "grad_norm": 0.2866490588818978, + "learning_rate": 1.8713830570308343e-05, + "loss": 2.2217, + "step": 7982 + }, + { + "epoch": 0.9153766769865841, + "grad_norm": 0.25815223016280925, + "learning_rate": 1.8663535721752e-05, + "loss": 2.4529, + "step": 7983 + }, + { + "epoch": 0.9154913427359248, + "grad_norm": 0.2613118482300695, + "learning_rate": 1.8613307264385816e-05, + "loss": 2.4427, + "step": 7984 + }, + { + "epoch": 0.9156060084852654, + "grad_norm": 0.25563490272277123, + "learning_rate": 1.8563145205137856e-05, + "loss": 2.3248, + "step": 7985 + }, + { + "epoch": 0.9157206742346061, + "grad_norm": 0.26308704049976694, + "learning_rate": 1.8513049550926974e-05, + "loss": 2.3331, + "step": 7986 + }, + { + "epoch": 0.9158353399839468, + "grad_norm": 0.2691209456354479, + "learning_rate": 1.8463020308662927e-05, + "loss": 2.4257, + "step": 7987 + }, + { + "epoch": 0.9159500057332874, + "grad_norm": 0.24971958931054827, + "learning_rate": 1.8413057485246355e-05, + "loss": 2.2825, + "step": 7988 + }, + { + "epoch": 0.9160646714826282, + "grad_norm": 0.2347494049609805, + "learning_rate": 1.836316108756847e-05, + "loss": 2.4649, + "step": 7989 + }, + { + "epoch": 0.9161793372319688, + "grad_norm": 0.2379239030923568, + "learning_rate": 1.831333112251188e-05, + "loss": 2.3399, + "step": 7990 + }, + { + "epoch": 0.9162940029813095, + "grad_norm": 0.278357519210992, + "learning_rate": 1.826356759694936e-05, + "loss": 2.3701, + "step": 7991 + }, + { + "epoch": 0.9164086687306502, + "grad_norm": 0.2561537001232615, + "learning_rate": 1.821387051774498e-05, + "loss": 2.3156, + "step": 7992 + }, + { + "epoch": 0.9165233344799908, + "grad_norm": 0.26891840820248547, + "learning_rate": 1.8164239891753476e-05, + "loss": 2.3687, + "step": 7993 + }, + { + "epoch": 0.9166380002293315, + "grad_norm": 0.27703946127002466, + "learning_rate": 1.8114675725820428e-05, + "loss": 2.4344, + "step": 7994 + }, + { + "epoch": 0.9167526659786722, + "grad_norm": 0.268571678749942, + "learning_rate": 1.8065178026782315e-05, + "loss": 2.4033, + "step": 7995 + }, + { + "epoch": 0.9168673317280128, + "grad_norm": 0.24724163625560708, + "learning_rate": 1.8015746801466447e-05, + "loss": 2.2614, + "step": 7996 + }, + { + "epoch": 0.9169819974773535, + "grad_norm": 0.27260184253505326, + "learning_rate": 1.7966382056690765e-05, + "loss": 2.391, + "step": 7997 + }, + { + "epoch": 0.9170966632266941, + "grad_norm": 0.2591481752110663, + "learning_rate": 1.7917083799264323e-05, + "loss": 2.3386, + "step": 7998 + }, + { + "epoch": 0.9172113289760349, + "grad_norm": 0.2550676766374494, + "learning_rate": 1.786785203598679e-05, + "loss": 2.4072, + "step": 7999 + }, + { + "epoch": 0.9173259947253756, + "grad_norm": 0.26146311090283386, + "learning_rate": 1.781868677364884e-05, + "loss": 2.4198, + "step": 8000 + }, + { + "epoch": 0.9174406604747162, + "grad_norm": 0.2356798541763734, + "learning_rate": 1.7769588019031835e-05, + "loss": 2.4053, + "step": 8001 + }, + { + "epoch": 0.9175553262240569, + "grad_norm": 0.2529702641889254, + "learning_rate": 1.772055577890802e-05, + "loss": 2.2814, + "step": 8002 + }, + { + "epoch": 0.9176699919733975, + "grad_norm": 0.2943835985004157, + "learning_rate": 1.767159006004043e-05, + "loss": 2.4049, + "step": 8003 + }, + { + "epoch": 0.9177846577227382, + "grad_norm": 0.2807948584623163, + "learning_rate": 1.7622690869182946e-05, + "loss": 2.3376, + "step": 8004 + }, + { + "epoch": 0.9178993234720789, + "grad_norm": 0.25007874522344503, + "learning_rate": 1.7573858213080384e-05, + "loss": 2.4103, + "step": 8005 + }, + { + "epoch": 0.9180139892214195, + "grad_norm": 0.2658146043768412, + "learning_rate": 1.7525092098468143e-05, + "loss": 2.1952, + "step": 8006 + }, + { + "epoch": 0.9181286549707602, + "grad_norm": 0.27960997861621845, + "learning_rate": 1.7476392532072673e-05, + "loss": 2.2935, + "step": 8007 + }, + { + "epoch": 0.918243320720101, + "grad_norm": 0.2641727380124091, + "learning_rate": 1.7427759520611153e-05, + "loss": 2.3138, + "step": 8008 + }, + { + "epoch": 0.9183579864694416, + "grad_norm": 0.28608360586076703, + "learning_rate": 1.7379193070791387e-05, + "loss": 2.4391, + "step": 8009 + }, + { + "epoch": 0.9184726522187823, + "grad_norm": 0.2641146051205449, + "learning_rate": 1.7330693189312464e-05, + "loss": 2.4031, + "step": 8010 + }, + { + "epoch": 0.9185873179681229, + "grad_norm": 0.2477446034912744, + "learning_rate": 1.7282259882863972e-05, + "loss": 2.3376, + "step": 8011 + }, + { + "epoch": 0.9187019837174636, + "grad_norm": 0.24554748678152213, + "learning_rate": 1.723389315812618e-05, + "loss": 2.3335, + "step": 8012 + }, + { + "epoch": 0.9188166494668043, + "grad_norm": 0.24960720814015303, + "learning_rate": 1.7185593021770472e-05, + "loss": 2.335, + "step": 8013 + }, + { + "epoch": 0.9189313152161449, + "grad_norm": 0.2515645579988811, + "learning_rate": 1.713735948045897e-05, + "loss": 2.4738, + "step": 8014 + }, + { + "epoch": 0.9190459809654856, + "grad_norm": 0.2641872583191583, + "learning_rate": 1.708919254084451e-05, + "loss": 2.4574, + "step": 8015 + }, + { + "epoch": 0.9191606467148263, + "grad_norm": 0.2584275813051626, + "learning_rate": 1.7041092209570784e-05, + "loss": 2.4039, + "step": 8016 + }, + { + "epoch": 0.9192753124641669, + "grad_norm": 0.24781138472333708, + "learning_rate": 1.6993058493272483e-05, + "loss": 2.3033, + "step": 8017 + }, + { + "epoch": 0.9193899782135077, + "grad_norm": 0.24658448928599494, + "learning_rate": 1.6945091398574698e-05, + "loss": 2.2932, + "step": 8018 + }, + { + "epoch": 0.9195046439628483, + "grad_norm": 0.2739745712532286, + "learning_rate": 1.689719093209363e-05, + "loss": 2.4372, + "step": 8019 + }, + { + "epoch": 0.919619309712189, + "grad_norm": 0.26670163818051185, + "learning_rate": 1.6849357100436448e-05, + "loss": 2.3562, + "step": 8020 + }, + { + "epoch": 0.9197339754615297, + "grad_norm": 0.2704356247511721, + "learning_rate": 1.68015899102007e-05, + "loss": 2.2534, + "step": 8021 + }, + { + "epoch": 0.9198486412108703, + "grad_norm": 0.2996818891262219, + "learning_rate": 1.6753889367975008e-05, + "loss": 2.4931, + "step": 8022 + }, + { + "epoch": 0.919963306960211, + "grad_norm": 0.2456550937484026, + "learning_rate": 1.670625548033883e-05, + "loss": 2.4219, + "step": 8023 + }, + { + "epoch": 0.9200779727095516, + "grad_norm": 0.26025304614520073, + "learning_rate": 1.6658688253862185e-05, + "loss": 2.2894, + "step": 8024 + }, + { + "epoch": 0.9201926384588923, + "grad_norm": 0.2578033726658523, + "learning_rate": 1.6611187695106223e-05, + "loss": 2.2727, + "step": 8025 + }, + { + "epoch": 0.920307304208233, + "grad_norm": 0.26216816800449183, + "learning_rate": 1.6563753810622752e-05, + "loss": 2.3408, + "step": 8026 + }, + { + "epoch": 0.9204219699575736, + "grad_norm": 0.261256331046384, + "learning_rate": 1.6516386606954203e-05, + "loss": 2.3193, + "step": 8027 + }, + { + "epoch": 0.9205366357069144, + "grad_norm": 0.24402673385751353, + "learning_rate": 1.6469086090634134e-05, + "loss": 2.4218, + "step": 8028 + }, + { + "epoch": 0.9206513014562551, + "grad_norm": 0.2746084357662859, + "learning_rate": 1.642185226818671e-05, + "loss": 2.3899, + "step": 8029 + }, + { + "epoch": 0.9207659672055957, + "grad_norm": 0.24236924930280326, + "learning_rate": 1.6374685146126943e-05, + "loss": 2.436, + "step": 8030 + }, + { + "epoch": 0.9208806329549364, + "grad_norm": 0.22700968993118648, + "learning_rate": 1.6327584730960576e-05, + "loss": 2.3948, + "step": 8031 + }, + { + "epoch": 0.920995298704277, + "grad_norm": 0.2535278933427497, + "learning_rate": 1.6280551029184355e-05, + "loss": 2.4042, + "step": 8032 + }, + { + "epoch": 0.9211099644536177, + "grad_norm": 0.24834352454977696, + "learning_rate": 1.6233584047285478e-05, + "loss": 2.2668, + "step": 8033 + }, + { + "epoch": 0.9212246302029584, + "grad_norm": 0.2611479636369127, + "learning_rate": 1.6186683791742318e-05, + "loss": 2.3545, + "step": 8034 + }, + { + "epoch": 0.921339295952299, + "grad_norm": 0.24651986676882395, + "learning_rate": 1.613985026902376e-05, + "loss": 2.2661, + "step": 8035 + }, + { + "epoch": 0.9214539617016397, + "grad_norm": 0.25988313718205536, + "learning_rate": 1.6093083485589633e-05, + "loss": 2.4275, + "step": 8036 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 0.2875927731094603, + "learning_rate": 1.6046383447890612e-05, + "loss": 2.3422, + "step": 8037 + }, + { + "epoch": 0.921683293200321, + "grad_norm": 0.2608070110660877, + "learning_rate": 1.599975016236799e-05, + "loss": 2.5118, + "step": 8038 + }, + { + "epoch": 0.9217979589496618, + "grad_norm": 0.2589293421098052, + "learning_rate": 1.595318363545384e-05, + "loss": 2.3081, + "step": 8039 + }, + { + "epoch": 0.9219126246990024, + "grad_norm": 0.2722987015486067, + "learning_rate": 1.590668387357136e-05, + "loss": 2.4379, + "step": 8040 + }, + { + "epoch": 0.9220272904483431, + "grad_norm": 0.25924875070456926, + "learning_rate": 1.5860250883134143e-05, + "loss": 2.3375, + "step": 8041 + }, + { + "epoch": 0.9221419561976838, + "grad_norm": 0.26974540043077905, + "learning_rate": 1.581388467054673e-05, + "loss": 2.4762, + "step": 8042 + }, + { + "epoch": 0.9222566219470244, + "grad_norm": 0.2799503301423009, + "learning_rate": 1.5767585242204562e-05, + "loss": 2.3385, + "step": 8043 + }, + { + "epoch": 0.9223712876963651, + "grad_norm": 0.26064917524069303, + "learning_rate": 1.5721352604493645e-05, + "loss": 2.4257, + "step": 8044 + }, + { + "epoch": 0.9224859534457057, + "grad_norm": 0.31501954858120906, + "learning_rate": 1.567518676379098e-05, + "loss": 2.2656, + "step": 8045 + }, + { + "epoch": 0.9226006191950464, + "grad_norm": 0.28667108818283005, + "learning_rate": 1.5629087726464265e-05, + "loss": 2.3694, + "step": 8046 + }, + { + "epoch": 0.9227152849443871, + "grad_norm": 0.2293863517201724, + "learning_rate": 1.55830554988719e-05, + "loss": 2.2784, + "step": 8047 + }, + { + "epoch": 0.9228299506937278, + "grad_norm": 0.25280557352421623, + "learning_rate": 1.5537090087363205e-05, + "loss": 2.2526, + "step": 8048 + }, + { + "epoch": 0.9229446164430685, + "grad_norm": 0.25257668366634006, + "learning_rate": 1.549119149827821e-05, + "loss": 2.3565, + "step": 8049 + }, + { + "epoch": 0.9230592821924092, + "grad_norm": 0.24461801050401896, + "learning_rate": 1.5445359737947806e-05, + "loss": 2.3586, + "step": 8050 + }, + { + "epoch": 0.9231739479417498, + "grad_norm": 0.2637585277435613, + "learning_rate": 1.539959481269354e-05, + "loss": 2.5222, + "step": 8051 + }, + { + "epoch": 0.9232886136910905, + "grad_norm": 0.27081062356170865, + "learning_rate": 1.5353896728827922e-05, + "loss": 2.4402, + "step": 8052 + }, + { + "epoch": 0.9234032794404311, + "grad_norm": 0.24721872818071702, + "learning_rate": 1.5308265492654018e-05, + "loss": 2.5042, + "step": 8053 + }, + { + "epoch": 0.9235179451897718, + "grad_norm": 0.29667772138042775, + "learning_rate": 1.526270111046574e-05, + "loss": 2.3161, + "step": 8054 + }, + { + "epoch": 0.9236326109391125, + "grad_norm": 0.23851441070567184, + "learning_rate": 1.5217203588548001e-05, + "loss": 2.2463, + "step": 8055 + }, + { + "epoch": 0.9237472766884531, + "grad_norm": 0.2597259226025099, + "learning_rate": 1.5171772933176175e-05, + "loss": 2.3163, + "step": 8056 + }, + { + "epoch": 0.9238619424377938, + "grad_norm": 0.2621332819611845, + "learning_rate": 1.5126409150616638e-05, + "loss": 2.3143, + "step": 8057 + }, + { + "epoch": 0.9239766081871345, + "grad_norm": 0.2253327387878889, + "learning_rate": 1.5081112247126494e-05, + "loss": 2.3978, + "step": 8058 + }, + { + "epoch": 0.9240912739364752, + "grad_norm": 0.24468009057057297, + "learning_rate": 1.5035882228953357e-05, + "loss": 2.3048, + "step": 8059 + }, + { + "epoch": 0.9242059396858159, + "grad_norm": 0.2616487800639435, + "learning_rate": 1.4990719102336126e-05, + "loss": 2.3876, + "step": 8060 + }, + { + "epoch": 0.9243206054351565, + "grad_norm": 0.25626595317777, + "learning_rate": 1.4945622873504094e-05, + "loss": 2.4645, + "step": 8061 + }, + { + "epoch": 0.9244352711844972, + "grad_norm": 0.25829895455656565, + "learning_rate": 1.4900593548677343e-05, + "loss": 2.3807, + "step": 8062 + }, + { + "epoch": 0.9245499369338379, + "grad_norm": 0.2801724726914918, + "learning_rate": 1.4855631134066905e-05, + "loss": 2.3013, + "step": 8063 + }, + { + "epoch": 0.9246646026831785, + "grad_norm": 0.2341175315881007, + "learning_rate": 1.4810735635874428e-05, + "loss": 2.3921, + "step": 8064 + }, + { + "epoch": 0.9247792684325192, + "grad_norm": 0.2574068992612916, + "learning_rate": 1.476590706029246e-05, + "loss": 2.374, + "step": 8065 + }, + { + "epoch": 0.9248939341818598, + "grad_norm": 0.2625702326673317, + "learning_rate": 1.4721145413504167e-05, + "loss": 2.5189, + "step": 8066 + }, + { + "epoch": 0.9250085999312005, + "grad_norm": 0.25630476012187603, + "learning_rate": 1.4676450701683719e-05, + "loss": 2.3504, + "step": 8067 + }, + { + "epoch": 0.9251232656805413, + "grad_norm": 0.2825537565181648, + "learning_rate": 1.4631822930995686e-05, + "loss": 2.3681, + "step": 8068 + }, + { + "epoch": 0.9252379314298819, + "grad_norm": 0.25883033966670527, + "learning_rate": 1.45872621075957e-05, + "loss": 2.4504, + "step": 8069 + }, + { + "epoch": 0.9253525971792226, + "grad_norm": 0.2772779369534294, + "learning_rate": 1.4542768237630177e-05, + "loss": 2.4007, + "step": 8070 + }, + { + "epoch": 0.9254672629285632, + "grad_norm": 0.262993882211127, + "learning_rate": 1.4498341327236041e-05, + "loss": 2.3537, + "step": 8071 + }, + { + "epoch": 0.9255819286779039, + "grad_norm": 0.2574064289083826, + "learning_rate": 1.445398138254128e-05, + "loss": 2.6096, + "step": 8072 + }, + { + "epoch": 0.9256965944272446, + "grad_norm": 0.2851183605526974, + "learning_rate": 1.4409688409664445e-05, + "loss": 2.393, + "step": 8073 + }, + { + "epoch": 0.9258112601765852, + "grad_norm": 0.2700998206778859, + "learning_rate": 1.4365462414714815e-05, + "loss": 2.4588, + "step": 8074 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.24209341760037892, + "learning_rate": 1.4321303403792674e-05, + "loss": 2.4768, + "step": 8075 + }, + { + "epoch": 0.9260405916752666, + "grad_norm": 0.2615796457137312, + "learning_rate": 1.4277211382988876e-05, + "loss": 2.4355, + "step": 8076 + }, + { + "epoch": 0.9261552574246072, + "grad_norm": 0.2481692971947895, + "learning_rate": 1.4233186358384998e-05, + "loss": 2.3523, + "step": 8077 + }, + { + "epoch": 0.926269923173948, + "grad_norm": 0.26187215463348046, + "learning_rate": 1.4189228336053516e-05, + "loss": 2.4091, + "step": 8078 + }, + { + "epoch": 0.9263845889232886, + "grad_norm": 0.282674474270493, + "learning_rate": 1.4145337322057583e-05, + "loss": 2.4269, + "step": 8079 + }, + { + "epoch": 0.9264992546726293, + "grad_norm": 0.23963390516482738, + "learning_rate": 1.4101513322451131e-05, + "loss": 2.4968, + "step": 8080 + }, + { + "epoch": 0.92661392042197, + "grad_norm": 0.2648079043992534, + "learning_rate": 1.4057756343278827e-05, + "loss": 2.4128, + "step": 8081 + }, + { + "epoch": 0.9267285861713106, + "grad_norm": 0.25472455996478843, + "learning_rate": 1.401406639057623e-05, + "loss": 2.2936, + "step": 8082 + }, + { + "epoch": 0.9268432519206513, + "grad_norm": 0.26990040563559786, + "learning_rate": 1.3970443470369354e-05, + "loss": 2.4937, + "step": 8083 + }, + { + "epoch": 0.926957917669992, + "grad_norm": 0.25026388096420865, + "learning_rate": 1.3926887588675274e-05, + "loss": 2.3775, + "step": 8084 + }, + { + "epoch": 0.9270725834193326, + "grad_norm": 0.2750035741354005, + "learning_rate": 1.388339875150163e-05, + "loss": 2.3809, + "step": 8085 + }, + { + "epoch": 0.9271872491686733, + "grad_norm": 0.2617939472972245, + "learning_rate": 1.3839976964846901e-05, + "loss": 2.4285, + "step": 8086 + }, + { + "epoch": 0.927301914918014, + "grad_norm": 0.24190652887271005, + "learning_rate": 1.3796622234700295e-05, + "loss": 2.3817, + "step": 8087 + }, + { + "epoch": 0.9274165806673547, + "grad_norm": 0.2446660276298129, + "learning_rate": 1.375333456704181e-05, + "loss": 2.3651, + "step": 8088 + }, + { + "epoch": 0.9275312464166954, + "grad_norm": 0.2701734654306042, + "learning_rate": 1.3710113967842053e-05, + "loss": 2.3684, + "step": 8089 + }, + { + "epoch": 0.927645912166036, + "grad_norm": 0.2663681660267147, + "learning_rate": 1.3666960443062649e-05, + "loss": 2.3223, + "step": 8090 + }, + { + "epoch": 0.9277605779153767, + "grad_norm": 0.24776890733817822, + "learning_rate": 1.3623873998655612e-05, + "loss": 2.3772, + "step": 8091 + }, + { + "epoch": 0.9278752436647173, + "grad_norm": 0.23582531936025827, + "learning_rate": 1.3580854640564023e-05, + "loss": 2.3119, + "step": 8092 + }, + { + "epoch": 0.927989909414058, + "grad_norm": 0.2609344013850428, + "learning_rate": 1.3537902374721523e-05, + "loss": 2.3441, + "step": 8093 + }, + { + "epoch": 0.9281045751633987, + "grad_norm": 0.24913621976369907, + "learning_rate": 1.349501720705254e-05, + "loss": 2.3979, + "step": 8094 + }, + { + "epoch": 0.9282192409127393, + "grad_norm": 0.27003163874076047, + "learning_rate": 1.3452199143472343e-05, + "loss": 2.351, + "step": 8095 + }, + { + "epoch": 0.92833390666208, + "grad_norm": 0.2652766675384967, + "learning_rate": 1.3409448189886874e-05, + "loss": 2.3332, + "step": 8096 + }, + { + "epoch": 0.9284485724114208, + "grad_norm": 0.2418609052748786, + "learning_rate": 1.336676435219275e-05, + "loss": 2.4123, + "step": 8097 + }, + { + "epoch": 0.9285632381607614, + "grad_norm": 0.2668356668007308, + "learning_rate": 1.3324147636277372e-05, + "loss": 2.3291, + "step": 8098 + }, + { + "epoch": 0.9286779039101021, + "grad_norm": 0.2770723424643746, + "learning_rate": 1.3281598048018928e-05, + "loss": 2.4089, + "step": 8099 + }, + { + "epoch": 0.9287925696594427, + "grad_norm": 0.30168713655626106, + "learning_rate": 1.3239115593286389e-05, + "loss": 2.3588, + "step": 8100 + }, + { + "epoch": 0.9289072354087834, + "grad_norm": 0.2535836061655585, + "learning_rate": 1.3196700277939344e-05, + "loss": 2.2975, + "step": 8101 + }, + { + "epoch": 0.9290219011581241, + "grad_norm": 0.24718738857233885, + "learning_rate": 1.3154352107828171e-05, + "loss": 2.5053, + "step": 8102 + }, + { + "epoch": 0.9291365669074647, + "grad_norm": 0.231351121321423, + "learning_rate": 1.3112071088793976e-05, + "loss": 2.2341, + "step": 8103 + }, + { + "epoch": 0.9292512326568054, + "grad_norm": 0.23475932132271454, + "learning_rate": 1.3069857226668647e-05, + "loss": 2.3904, + "step": 8104 + }, + { + "epoch": 0.929365898406146, + "grad_norm": 0.23460476116341777, + "learning_rate": 1.3027710527274806e-05, + "loss": 2.3637, + "step": 8105 + }, + { + "epoch": 0.9294805641554867, + "grad_norm": 0.2576488117986609, + "learning_rate": 1.2985630996425746e-05, + "loss": 2.3566, + "step": 8106 + }, + { + "epoch": 0.9295952299048275, + "grad_norm": 0.24929724574098233, + "learning_rate": 1.2943618639925547e-05, + "loss": 2.3319, + "step": 8107 + }, + { + "epoch": 0.9297098956541681, + "grad_norm": 0.23539520977208472, + "learning_rate": 1.2901673463569075e-05, + "loss": 2.3742, + "step": 8108 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 0.25739202726007077, + "learning_rate": 1.28597954731417e-05, + "loss": 2.449, + "step": 8109 + }, + { + "epoch": 0.9299392271528495, + "grad_norm": 0.2877615685642805, + "learning_rate": 1.2817984674419914e-05, + "loss": 2.4556, + "step": 8110 + }, + { + "epoch": 0.9300538929021901, + "grad_norm": 0.26353407010656826, + "learning_rate": 1.2776241073170603e-05, + "loss": 2.4423, + "step": 8111 + }, + { + "epoch": 0.9301685586515308, + "grad_norm": 0.2560319015406194, + "learning_rate": 1.273456467515144e-05, + "loss": 2.1551, + "step": 8112 + }, + { + "epoch": 0.9302832244008714, + "grad_norm": 0.24250028838108162, + "learning_rate": 1.2692955486110991e-05, + "loss": 2.4481, + "step": 8113 + }, + { + "epoch": 0.9303978901502121, + "grad_norm": 0.2682825746063407, + "learning_rate": 1.265141351178839e-05, + "loss": 2.3304, + "step": 8114 + }, + { + "epoch": 0.9305125558995528, + "grad_norm": 0.2331300577866949, + "learning_rate": 1.2609938757913553e-05, + "loss": 2.2855, + "step": 8115 + }, + { + "epoch": 0.9306272216488934, + "grad_norm": 0.2588524294465511, + "learning_rate": 1.2568531230207236e-05, + "loss": 2.2791, + "step": 8116 + }, + { + "epoch": 0.9307418873982342, + "grad_norm": 0.2512275611350516, + "learning_rate": 1.252719093438076e-05, + "loss": 2.3478, + "step": 8117 + }, + { + "epoch": 0.9308565531475749, + "grad_norm": 0.27675649325902624, + "learning_rate": 1.2485917876136178e-05, + "loss": 2.3105, + "step": 8118 + }, + { + "epoch": 0.9309712188969155, + "grad_norm": 0.25420024848552236, + "learning_rate": 1.2444712061166318e-05, + "loss": 2.2799, + "step": 8119 + }, + { + "epoch": 0.9310858846462562, + "grad_norm": 0.31861198933237, + "learning_rate": 1.2403573495154863e-05, + "loss": 2.4298, + "step": 8120 + }, + { + "epoch": 0.9312005503955968, + "grad_norm": 0.2561857908232313, + "learning_rate": 1.2362502183775937e-05, + "loss": 2.38, + "step": 8121 + }, + { + "epoch": 0.9313152161449375, + "grad_norm": 0.255324395430613, + "learning_rate": 1.2321498132694676e-05, + "loss": 2.3723, + "step": 8122 + }, + { + "epoch": 0.9314298818942782, + "grad_norm": 0.2642985124976724, + "learning_rate": 1.2280561347566777e-05, + "loss": 2.4441, + "step": 8123 + }, + { + "epoch": 0.9315445476436188, + "grad_norm": 0.28474685572396197, + "learning_rate": 1.2239691834038558e-05, + "loss": 2.4006, + "step": 8124 + }, + { + "epoch": 0.9316592133929595, + "grad_norm": 0.27220102181672123, + "learning_rate": 1.2198889597747342e-05, + "loss": 2.3648, + "step": 8125 + }, + { + "epoch": 0.9317738791423001, + "grad_norm": 0.2921522234509613, + "learning_rate": 1.2158154644321018e-05, + "loss": 2.3943, + "step": 8126 + }, + { + "epoch": 0.9318885448916409, + "grad_norm": 0.2793095130191193, + "learning_rate": 1.211748697937809e-05, + "loss": 2.3729, + "step": 8127 + }, + { + "epoch": 0.9320032106409816, + "grad_norm": 0.24376219647054306, + "learning_rate": 1.2076886608527959e-05, + "loss": 2.3583, + "step": 8128 + }, + { + "epoch": 0.9321178763903222, + "grad_norm": 0.26408061711823383, + "learning_rate": 1.2036353537370648e-05, + "loss": 2.5253, + "step": 8129 + }, + { + "epoch": 0.9322325421396629, + "grad_norm": 0.24676651918416226, + "learning_rate": 1.199588777149696e-05, + "loss": 2.4686, + "step": 8130 + }, + { + "epoch": 0.9323472078890036, + "grad_norm": 0.25976025126099656, + "learning_rate": 1.1955489316488322e-05, + "loss": 2.432, + "step": 8131 + }, + { + "epoch": 0.9324618736383442, + "grad_norm": 0.24315919247582696, + "learning_rate": 1.1915158177916996e-05, + "loss": 2.3466, + "step": 8132 + }, + { + "epoch": 0.9325765393876849, + "grad_norm": 0.2435881397005569, + "learning_rate": 1.1874894361345867e-05, + "loss": 2.2413, + "step": 8133 + }, + { + "epoch": 0.9326912051370255, + "grad_norm": 0.2784322742443304, + "learning_rate": 1.1834697872328492e-05, + "loss": 2.3823, + "step": 8134 + }, + { + "epoch": 0.9328058708863662, + "grad_norm": 0.27380728021859974, + "learning_rate": 1.179456871640927e-05, + "loss": 2.3836, + "step": 8135 + }, + { + "epoch": 0.932920536635707, + "grad_norm": 0.2696855930390986, + "learning_rate": 1.1754506899123273e-05, + "loss": 2.4284, + "step": 8136 + }, + { + "epoch": 0.9330352023850476, + "grad_norm": 0.2632257106020182, + "learning_rate": 1.171451242599625e-05, + "loss": 2.382, + "step": 8137 + }, + { + "epoch": 0.9331498681343883, + "grad_norm": 0.27040681980860104, + "learning_rate": 1.1674585302544726e-05, + "loss": 2.493, + "step": 8138 + }, + { + "epoch": 0.9332645338837289, + "grad_norm": 0.2511660045678568, + "learning_rate": 1.1634725534275692e-05, + "loss": 2.3608, + "step": 8139 + }, + { + "epoch": 0.9333791996330696, + "grad_norm": 0.25653616936615176, + "learning_rate": 1.1594933126687301e-05, + "loss": 2.2532, + "step": 8140 + }, + { + "epoch": 0.9334938653824103, + "grad_norm": 0.26385315639963225, + "learning_rate": 1.1555208085267999e-05, + "loss": 2.4286, + "step": 8141 + }, + { + "epoch": 0.9336085311317509, + "grad_norm": 0.24634496044355686, + "learning_rate": 1.1515550415497122e-05, + "loss": 2.3009, + "step": 8142 + }, + { + "epoch": 0.9337231968810916, + "grad_norm": 0.25905183880565524, + "learning_rate": 1.1475960122844742e-05, + "loss": 2.3532, + "step": 8143 + }, + { + "epoch": 0.9338378626304323, + "grad_norm": 0.262936580628309, + "learning_rate": 1.1436437212771489e-05, + "loss": 2.2969, + "step": 8144 + }, + { + "epoch": 0.9339525283797729, + "grad_norm": 0.24296228702044412, + "learning_rate": 1.1396981690728891e-05, + "loss": 2.3006, + "step": 8145 + }, + { + "epoch": 0.9340671941291137, + "grad_norm": 0.2564210334966024, + "learning_rate": 1.1357593562159096e-05, + "loss": 2.5314, + "step": 8146 + }, + { + "epoch": 0.9341818598784543, + "grad_norm": 0.2522925381206959, + "learning_rate": 1.1318272832494814e-05, + "loss": 2.3658, + "step": 8147 + }, + { + "epoch": 0.934296525627795, + "grad_norm": 0.25305273931603023, + "learning_rate": 1.1279019507159705e-05, + "loss": 2.4349, + "step": 8148 + }, + { + "epoch": 0.9344111913771357, + "grad_norm": 0.23915144230310795, + "learning_rate": 1.1239833591567994e-05, + "loss": 2.3122, + "step": 8149 + }, + { + "epoch": 0.9345258571264763, + "grad_norm": 0.24068217190610214, + "learning_rate": 1.1200715091124637e-05, + "loss": 2.4044, + "step": 8150 + }, + { + "epoch": 0.934640522875817, + "grad_norm": 0.2559142024861348, + "learning_rate": 1.1161664011225203e-05, + "loss": 2.3234, + "step": 8151 + }, + { + "epoch": 0.9347551886251577, + "grad_norm": 0.24937071839083125, + "learning_rate": 1.1122680357256222e-05, + "loss": 2.3436, + "step": 8152 + }, + { + "epoch": 0.9348698543744983, + "grad_norm": 0.2494229983249669, + "learning_rate": 1.1083764134594554e-05, + "loss": 2.3909, + "step": 8153 + }, + { + "epoch": 0.934984520123839, + "grad_norm": 0.24045518556775694, + "learning_rate": 1.104491534860802e-05, + "loss": 2.4098, + "step": 8154 + }, + { + "epoch": 0.9350991858731796, + "grad_norm": 0.2612825271897257, + "learning_rate": 1.1006134004655111e-05, + "loss": 2.503, + "step": 8155 + }, + { + "epoch": 0.9352138516225204, + "grad_norm": 0.26178766323469627, + "learning_rate": 1.0967420108084991e-05, + "loss": 2.2943, + "step": 8156 + }, + { + "epoch": 0.9353285173718611, + "grad_norm": 0.25675477711198846, + "learning_rate": 1.092877366423739e-05, + "loss": 2.3567, + "step": 8157 + }, + { + "epoch": 0.9354431831212017, + "grad_norm": 0.2691510228172048, + "learning_rate": 1.0890194678442989e-05, + "loss": 2.4989, + "step": 8158 + }, + { + "epoch": 0.9355578488705424, + "grad_norm": 0.2533649609496583, + "learning_rate": 1.0851683156022862e-05, + "loss": 2.2879, + "step": 8159 + }, + { + "epoch": 0.935672514619883, + "grad_norm": 0.2614503229547817, + "learning_rate": 1.081323910228904e-05, + "loss": 2.3818, + "step": 8160 + }, + { + "epoch": 0.9357871803692237, + "grad_norm": 0.27044941928066546, + "learning_rate": 1.0774862522544227e-05, + "loss": 2.4398, + "step": 8161 + }, + { + "epoch": 0.9359018461185644, + "grad_norm": 0.25535286117410844, + "learning_rate": 1.0736553422081574e-05, + "loss": 2.2717, + "step": 8162 + }, + { + "epoch": 0.936016511867905, + "grad_norm": 0.26589397287993904, + "learning_rate": 1.0698311806185135e-05, + "loss": 2.3391, + "step": 8163 + }, + { + "epoch": 0.9361311776172457, + "grad_norm": 0.26660613382708637, + "learning_rate": 1.0660137680129688e-05, + "loss": 2.4047, + "step": 8164 + }, + { + "epoch": 0.9362458433665864, + "grad_norm": 0.24489497052574707, + "learning_rate": 1.062203104918058e-05, + "loss": 2.392, + "step": 8165 + }, + { + "epoch": 0.936360509115927, + "grad_norm": 0.2646791392910024, + "learning_rate": 1.0583991918593883e-05, + "loss": 2.3431, + "step": 8166 + }, + { + "epoch": 0.9364751748652678, + "grad_norm": 0.2740548601999037, + "learning_rate": 1.0546020293616455e-05, + "loss": 2.3384, + "step": 8167 + }, + { + "epoch": 0.9365898406146084, + "grad_norm": 0.25543228522917905, + "learning_rate": 1.0508116179485605e-05, + "loss": 2.3592, + "step": 8168 + }, + { + "epoch": 0.9367045063639491, + "grad_norm": 0.25087072636070334, + "learning_rate": 1.0470279581429542e-05, + "loss": 2.3475, + "step": 8169 + }, + { + "epoch": 0.9368191721132898, + "grad_norm": 0.25474509236004017, + "learning_rate": 1.0432510504667203e-05, + "loss": 2.4221, + "step": 8170 + }, + { + "epoch": 0.9369338378626304, + "grad_norm": 0.23843440945386765, + "learning_rate": 1.039480895440803e-05, + "loss": 2.1601, + "step": 8171 + }, + { + "epoch": 0.9370485036119711, + "grad_norm": 0.26492667132581915, + "learning_rate": 1.0357174935852199e-05, + "loss": 2.3854, + "step": 8172 + }, + { + "epoch": 0.9371631693613117, + "grad_norm": 0.24130163619148037, + "learning_rate": 1.031960845419072e-05, + "loss": 2.4521, + "step": 8173 + }, + { + "epoch": 0.9372778351106524, + "grad_norm": 0.2520288278931533, + "learning_rate": 1.0282109514604954e-05, + "loss": 2.3969, + "step": 8174 + }, + { + "epoch": 0.9373925008599931, + "grad_norm": 0.2886621048163839, + "learning_rate": 1.024467812226737e-05, + "loss": 2.3921, + "step": 8175 + }, + { + "epoch": 0.9375071666093338, + "grad_norm": 0.2902248853621348, + "learning_rate": 1.0207314282340952e-05, + "loss": 2.451, + "step": 8176 + }, + { + "epoch": 0.9376218323586745, + "grad_norm": 0.25050174508757433, + "learning_rate": 1.0170017999979131e-05, + "loss": 2.4845, + "step": 8177 + }, + { + "epoch": 0.9377364981080152, + "grad_norm": 0.2586959952570016, + "learning_rate": 1.0132789280326293e-05, + "loss": 2.3119, + "step": 8178 + }, + { + "epoch": 0.9378511638573558, + "grad_norm": 0.24259731500228857, + "learning_rate": 1.0095628128517497e-05, + "loss": 2.2834, + "step": 8179 + }, + { + "epoch": 0.9379658296066965, + "grad_norm": 0.2624829536896241, + "learning_rate": 1.0058534549678367e-05, + "loss": 2.5068, + "step": 8180 + }, + { + "epoch": 0.9380804953560371, + "grad_norm": 0.2456246342332185, + "learning_rate": 1.0021508548925306e-05, + "loss": 2.351, + "step": 8181 + }, + { + "epoch": 0.9381951611053778, + "grad_norm": 0.2833106641353864, + "learning_rate": 9.984550131365289e-06, + "loss": 2.5897, + "step": 8182 + }, + { + "epoch": 0.9383098268547185, + "grad_norm": 0.2763389271792517, + "learning_rate": 9.947659302096012e-06, + "loss": 2.3534, + "step": 8183 + }, + { + "epoch": 0.9384244926040591, + "grad_norm": 0.25918347093172817, + "learning_rate": 9.910836066205852e-06, + "loss": 2.2401, + "step": 8184 + }, + { + "epoch": 0.9385391583533998, + "grad_norm": 0.27506661410040334, + "learning_rate": 9.874080428773968e-06, + "loss": 2.4616, + "step": 8185 + }, + { + "epoch": 0.9386538241027406, + "grad_norm": 0.2791890698718216, + "learning_rate": 9.837392394870026e-06, + "loss": 2.3201, + "step": 8186 + }, + { + "epoch": 0.9387684898520812, + "grad_norm": 0.2563764371024099, + "learning_rate": 9.800771969554423e-06, + "loss": 2.402, + "step": 8187 + }, + { + "epoch": 0.9388831556014219, + "grad_norm": 0.23461644597124304, + "learning_rate": 9.76421915787834e-06, + "loss": 2.38, + "step": 8188 + }, + { + "epoch": 0.9389978213507625, + "grad_norm": 0.2783591103087557, + "learning_rate": 9.727733964883355e-06, + "loss": 2.359, + "step": 8189 + }, + { + "epoch": 0.9391124871001032, + "grad_norm": 0.280376622582292, + "learning_rate": 9.691316395602168e-06, + "loss": 2.3404, + "step": 8190 + }, + { + "epoch": 0.9392271528494439, + "grad_norm": 0.261009485782321, + "learning_rate": 9.6549664550577e-06, + "loss": 2.207, + "step": 8191 + }, + { + "epoch": 0.9393418185987845, + "grad_norm": 0.2382090117687303, + "learning_rate": 9.61868414826378e-06, + "loss": 2.3753, + "step": 8192 + }, + { + "epoch": 0.9394564843481252, + "grad_norm": 0.29608566487414617, + "learning_rate": 9.582469480224842e-06, + "loss": 2.4569, + "step": 8193 + }, + { + "epoch": 0.9395711500974658, + "grad_norm": 0.23546322155848892, + "learning_rate": 9.546322455936062e-06, + "loss": 2.3255, + "step": 8194 + }, + { + "epoch": 0.9396858158468065, + "grad_norm": 0.2682526249035245, + "learning_rate": 9.510243080383174e-06, + "loss": 2.4585, + "step": 8195 + }, + { + "epoch": 0.9398004815961473, + "grad_norm": 0.258771175203155, + "learning_rate": 9.474231358542751e-06, + "loss": 2.5324, + "step": 8196 + }, + { + "epoch": 0.9399151473454879, + "grad_norm": 0.26280991110474716, + "learning_rate": 9.438287295381765e-06, + "loss": 2.3957, + "step": 8197 + }, + { + "epoch": 0.9400298130948286, + "grad_norm": 0.25583777145935155, + "learning_rate": 9.402410895858082e-06, + "loss": 2.2061, + "step": 8198 + }, + { + "epoch": 0.9401444788441693, + "grad_norm": 0.2683364000943464, + "learning_rate": 9.366602164920191e-06, + "loss": 2.4681, + "step": 8199 + }, + { + "epoch": 0.9402591445935099, + "grad_norm": 0.25470355285088, + "learning_rate": 9.330861107507194e-06, + "loss": 2.3237, + "step": 8200 + }, + { + "epoch": 0.9403738103428506, + "grad_norm": 0.2611328858482173, + "learning_rate": 9.295187728548926e-06, + "loss": 2.3808, + "step": 8201 + }, + { + "epoch": 0.9404884760921912, + "grad_norm": 0.23912676310221773, + "learning_rate": 9.25958203296584e-06, + "loss": 2.2712, + "step": 8202 + }, + { + "epoch": 0.9406031418415319, + "grad_norm": 0.2495063308244627, + "learning_rate": 9.224044025669009e-06, + "loss": 2.3571, + "step": 8203 + }, + { + "epoch": 0.9407178075908726, + "grad_norm": 0.2750014813890792, + "learning_rate": 9.188573711560232e-06, + "loss": 2.4177, + "step": 8204 + }, + { + "epoch": 0.9408324733402132, + "grad_norm": 0.2758533583967891, + "learning_rate": 9.153171095532097e-06, + "loss": 2.4742, + "step": 8205 + }, + { + "epoch": 0.940947139089554, + "grad_norm": 0.263950315915875, + "learning_rate": 9.117836182467532e-06, + "loss": 2.4068, + "step": 8206 + }, + { + "epoch": 0.9410618048388946, + "grad_norm": 0.23929323323736013, + "learning_rate": 9.082568977240469e-06, + "loss": 2.4301, + "step": 8207 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.26393297498459356, + "learning_rate": 9.04736948471524e-06, + "loss": 2.4352, + "step": 8208 + }, + { + "epoch": 0.941291136337576, + "grad_norm": 0.23995622895639435, + "learning_rate": 9.012237709746961e-06, + "loss": 2.3966, + "step": 8209 + }, + { + "epoch": 0.9414058020869166, + "grad_norm": 0.2907944245391007, + "learning_rate": 8.977173657181426e-06, + "loss": 2.3662, + "step": 8210 + }, + { + "epoch": 0.9415204678362573, + "grad_norm": 0.25209705749344846, + "learning_rate": 8.942177331855096e-06, + "loss": 2.3436, + "step": 8211 + }, + { + "epoch": 0.941635133585598, + "grad_norm": 0.261047032096895, + "learning_rate": 8.907248738594942e-06, + "loss": 2.3725, + "step": 8212 + }, + { + "epoch": 0.9417497993349386, + "grad_norm": 0.2461029250719317, + "learning_rate": 8.872387882218724e-06, + "loss": 2.4841, + "step": 8213 + }, + { + "epoch": 0.9418644650842793, + "grad_norm": 0.2571065825096209, + "learning_rate": 8.837594767534873e-06, + "loss": 2.2285, + "step": 8214 + }, + { + "epoch": 0.94197913083362, + "grad_norm": 0.24301011444603032, + "learning_rate": 8.802869399342384e-06, + "loss": 2.4527, + "step": 8215 + }, + { + "epoch": 0.9420937965829607, + "grad_norm": 0.2544954513278097, + "learning_rate": 8.768211782431036e-06, + "loss": 2.4334, + "step": 8216 + }, + { + "epoch": 0.9422084623323014, + "grad_norm": 0.2631521290016174, + "learning_rate": 8.73362192158117e-06, + "loss": 2.443, + "step": 8217 + }, + { + "epoch": 0.942323128081642, + "grad_norm": 0.24055154540659598, + "learning_rate": 8.699099821563749e-06, + "loss": 2.3142, + "step": 8218 + }, + { + "epoch": 0.9424377938309827, + "grad_norm": 0.26221366755403697, + "learning_rate": 8.66464548714041e-06, + "loss": 2.4519, + "step": 8219 + }, + { + "epoch": 0.9425524595803234, + "grad_norm": 0.24950036721508184, + "learning_rate": 8.630258923063573e-06, + "loss": 2.3231, + "step": 8220 + }, + { + "epoch": 0.942667125329664, + "grad_norm": 0.2587646515070118, + "learning_rate": 8.595940134076108e-06, + "loss": 2.474, + "step": 8221 + }, + { + "epoch": 0.9427817910790047, + "grad_norm": 0.2383356711770594, + "learning_rate": 8.561689124911731e-06, + "loss": 2.3602, + "step": 8222 + }, + { + "epoch": 0.9428964568283453, + "grad_norm": 0.26418347944946585, + "learning_rate": 8.527505900294719e-06, + "loss": 2.3306, + "step": 8223 + }, + { + "epoch": 0.943011122577686, + "grad_norm": 0.30481677695958276, + "learning_rate": 8.493390464939854e-06, + "loss": 2.3382, + "step": 8224 + }, + { + "epoch": 0.9431257883270268, + "grad_norm": 0.2556177220339646, + "learning_rate": 8.459342823552818e-06, + "loss": 2.4669, + "step": 8225 + }, + { + "epoch": 0.9432404540763674, + "grad_norm": 0.25263975231650243, + "learning_rate": 8.425362980829909e-06, + "loss": 2.3443, + "step": 8226 + }, + { + "epoch": 0.9433551198257081, + "grad_norm": 0.23674415081125696, + "learning_rate": 8.391450941457823e-06, + "loss": 2.39, + "step": 8227 + }, + { + "epoch": 0.9434697855750487, + "grad_norm": 0.275384626348196, + "learning_rate": 8.357606710114207e-06, + "loss": 2.4353, + "step": 8228 + }, + { + "epoch": 0.9435844513243894, + "grad_norm": 0.2596109302555167, + "learning_rate": 8.323830291467161e-06, + "loss": 2.4022, + "step": 8229 + }, + { + "epoch": 0.9436991170737301, + "grad_norm": 0.23552399591032394, + "learning_rate": 8.290121690175567e-06, + "loss": 2.2812, + "step": 8230 + }, + { + "epoch": 0.9438137828230707, + "grad_norm": 0.2832792483730591, + "learning_rate": 8.25648091088882e-06, + "loss": 2.4406, + "step": 8231 + }, + { + "epoch": 0.9439284485724114, + "grad_norm": 0.24407402094145375, + "learning_rate": 8.222907958247094e-06, + "loss": 2.4201, + "step": 8232 + }, + { + "epoch": 0.9440431143217521, + "grad_norm": 0.2359417764000179, + "learning_rate": 8.18940283688102e-06, + "loss": 2.2518, + "step": 8233 + }, + { + "epoch": 0.9441577800710927, + "grad_norm": 0.2953689762173765, + "learning_rate": 8.15596555141207e-06, + "loss": 2.4648, + "step": 8234 + }, + { + "epoch": 0.9442724458204335, + "grad_norm": 0.23583814425399952, + "learning_rate": 8.122596106452273e-06, + "loss": 2.4145, + "step": 8235 + }, + { + "epoch": 0.9443871115697741, + "grad_norm": 0.24081713067548055, + "learning_rate": 8.089294506604338e-06, + "loss": 2.4625, + "step": 8236 + }, + { + "epoch": 0.9445017773191148, + "grad_norm": 0.25618574964819085, + "learning_rate": 8.056060756461537e-06, + "loss": 2.3716, + "step": 8237 + }, + { + "epoch": 0.9446164430684555, + "grad_norm": 0.2931553241910916, + "learning_rate": 8.022894860607866e-06, + "loss": 2.3177, + "step": 8238 + }, + { + "epoch": 0.9447311088177961, + "grad_norm": 0.25201195399629506, + "learning_rate": 7.98979682361789e-06, + "loss": 2.387, + "step": 8239 + }, + { + "epoch": 0.9448457745671368, + "grad_norm": 0.23157594235541062, + "learning_rate": 7.956766650056901e-06, + "loss": 2.4126, + "step": 8240 + }, + { + "epoch": 0.9449604403164775, + "grad_norm": 0.25052233889127434, + "learning_rate": 7.923804344480756e-06, + "loss": 2.3845, + "step": 8241 + }, + { + "epoch": 0.9450751060658181, + "grad_norm": 0.2599024985461345, + "learning_rate": 7.890909911435929e-06, + "loss": 2.4538, + "step": 8242 + }, + { + "epoch": 0.9451897718151588, + "grad_norm": 0.2461863638989664, + "learning_rate": 7.858083355459677e-06, + "loss": 2.3649, + "step": 8243 + }, + { + "epoch": 0.9453044375644994, + "grad_norm": 0.25570765614632873, + "learning_rate": 7.825324681079715e-06, + "loss": 2.344, + "step": 8244 + }, + { + "epoch": 0.9454191033138402, + "grad_norm": 0.27073657468203466, + "learning_rate": 7.792633892814483e-06, + "loss": 2.2348, + "step": 8245 + }, + { + "epoch": 0.9455337690631809, + "grad_norm": 0.2875278758270435, + "learning_rate": 7.760010995173151e-06, + "loss": 2.2376, + "step": 8246 + }, + { + "epoch": 0.9456484348125215, + "grad_norm": 0.27750889442392596, + "learning_rate": 7.727455992655342e-06, + "loss": 2.5322, + "step": 8247 + }, + { + "epoch": 0.9457631005618622, + "grad_norm": 0.2559563778209828, + "learning_rate": 7.694968889751408e-06, + "loss": 2.3339, + "step": 8248 + }, + { + "epoch": 0.9458777663112028, + "grad_norm": 0.24754544966331177, + "learning_rate": 7.662549690942267e-06, + "loss": 2.3161, + "step": 8249 + }, + { + "epoch": 0.9459924320605435, + "grad_norm": 0.24799802622102513, + "learning_rate": 7.630198400699672e-06, + "loss": 2.3639, + "step": 8250 + }, + { + "epoch": 0.9461070978098842, + "grad_norm": 0.26823317484198467, + "learning_rate": 7.597915023485724e-06, + "loss": 2.4358, + "step": 8251 + }, + { + "epoch": 0.9462217635592248, + "grad_norm": 0.24518256672697789, + "learning_rate": 7.565699563753414e-06, + "loss": 2.4118, + "step": 8252 + }, + { + "epoch": 0.9463364293085655, + "grad_norm": 0.2583556497728397, + "learning_rate": 7.533552025946188e-06, + "loss": 2.3429, + "step": 8253 + }, + { + "epoch": 0.9464510950579063, + "grad_norm": 0.24802980403519487, + "learning_rate": 7.5014724144981115e-06, + "loss": 2.2754, + "step": 8254 + }, + { + "epoch": 0.9465657608072469, + "grad_norm": 0.251840908988671, + "learning_rate": 7.469460733834144e-06, + "loss": 2.3421, + "step": 8255 + }, + { + "epoch": 0.9466804265565876, + "grad_norm": 0.2718626846263573, + "learning_rate": 7.4375169883695306e-06, + "loss": 2.3137, + "step": 8256 + }, + { + "epoch": 0.9467950923059282, + "grad_norm": 0.2517344320976422, + "learning_rate": 7.405641182510359e-06, + "loss": 2.339, + "step": 8257 + }, + { + "epoch": 0.9469097580552689, + "grad_norm": 0.27945542632915055, + "learning_rate": 7.373833320653334e-06, + "loss": 2.4839, + "step": 8258 + }, + { + "epoch": 0.9470244238046096, + "grad_norm": 0.24301312496700453, + "learning_rate": 7.342093407185613e-06, + "loss": 2.2752, + "step": 8259 + }, + { + "epoch": 0.9471390895539502, + "grad_norm": 0.254627648646453, + "learning_rate": 7.310421446485194e-06, + "loss": 2.3177, + "step": 8260 + }, + { + "epoch": 0.9472537553032909, + "grad_norm": 0.2663780075296378, + "learning_rate": 7.2788174429206935e-06, + "loss": 2.3615, + "step": 8261 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.2669470205161946, + "learning_rate": 7.24728140085118e-06, + "loss": 2.3978, + "step": 8262 + }, + { + "epoch": 0.9474830868019722, + "grad_norm": 0.25541325954841004, + "learning_rate": 7.215813324626452e-06, + "loss": 2.4179, + "step": 8263 + }, + { + "epoch": 0.947597752551313, + "grad_norm": 0.26121280345126874, + "learning_rate": 7.184413218586927e-06, + "loss": 2.396, + "step": 8264 + }, + { + "epoch": 0.9477124183006536, + "grad_norm": 0.2819296299012922, + "learning_rate": 7.153081087063751e-06, + "loss": 2.4594, + "step": 8265 + }, + { + "epoch": 0.9478270840499943, + "grad_norm": 0.2311050058761652, + "learning_rate": 7.121816934378467e-06, + "loss": 2.3666, + "step": 8266 + }, + { + "epoch": 0.947941749799335, + "grad_norm": 0.24868183497068921, + "learning_rate": 7.090620764843458e-06, + "loss": 2.3062, + "step": 8267 + }, + { + "epoch": 0.9480564155486756, + "grad_norm": 0.2538853647633459, + "learning_rate": 7.059492582761618e-06, + "loss": 2.2446, + "step": 8268 + }, + { + "epoch": 0.9481710812980163, + "grad_norm": 0.25978775168953633, + "learning_rate": 7.028432392426398e-06, + "loss": 2.4882, + "step": 8269 + }, + { + "epoch": 0.9482857470473569, + "grad_norm": 0.22256521243699512, + "learning_rate": 6.997440198122151e-06, + "loss": 2.303, + "step": 8270 + }, + { + "epoch": 0.9484004127966976, + "grad_norm": 0.24493201282792895, + "learning_rate": 6.966516004123458e-06, + "loss": 2.4145, + "step": 8271 + }, + { + "epoch": 0.9485150785460383, + "grad_norm": 0.2867705130830074, + "learning_rate": 6.935659814695849e-06, + "loss": 2.3361, + "step": 8272 + }, + { + "epoch": 0.9486297442953789, + "grad_norm": 0.24303993238198116, + "learning_rate": 6.904871634095311e-06, + "loss": 2.1991, + "step": 8273 + }, + { + "epoch": 0.9487444100447197, + "grad_norm": 0.2561614797401005, + "learning_rate": 6.8741514665683905e-06, + "loss": 2.3874, + "step": 8274 + }, + { + "epoch": 0.9488590757940604, + "grad_norm": 0.25889911175158453, + "learning_rate": 6.84349931635253e-06, + "loss": 2.3403, + "step": 8275 + }, + { + "epoch": 0.948973741543401, + "grad_norm": 0.2696669586795356, + "learning_rate": 6.812915187675517e-06, + "loss": 2.4291, + "step": 8276 + }, + { + "epoch": 0.9490884072927417, + "grad_norm": 0.2656433627560766, + "learning_rate": 6.7823990847558635e-06, + "loss": 2.3351, + "step": 8277 + }, + { + "epoch": 0.9492030730420823, + "grad_norm": 0.2572834947221208, + "learning_rate": 6.751951011802649e-06, + "loss": 2.363, + "step": 8278 + }, + { + "epoch": 0.949317738791423, + "grad_norm": 0.2517027906554384, + "learning_rate": 6.7215709730156225e-06, + "loss": 2.2274, + "step": 8279 + }, + { + "epoch": 0.9494324045407637, + "grad_norm": 0.2744701809436497, + "learning_rate": 6.691258972585157e-06, + "loss": 2.4672, + "step": 8280 + }, + { + "epoch": 0.9495470702901043, + "grad_norm": 0.2598967443229175, + "learning_rate": 6.661015014692184e-06, + "loss": 2.5424, + "step": 8281 + }, + { + "epoch": 0.949661736039445, + "grad_norm": 0.2685728928687207, + "learning_rate": 6.630839103508312e-06, + "loss": 2.4848, + "step": 8282 + }, + { + "epoch": 0.9497764017887856, + "grad_norm": 0.28309410998051354, + "learning_rate": 6.600731243195712e-06, + "loss": 2.4671, + "step": 8283 + }, + { + "epoch": 0.9498910675381264, + "grad_norm": 0.2703105840260175, + "learning_rate": 6.570691437907172e-06, + "loss": 2.5553, + "step": 8284 + }, + { + "epoch": 0.9500057332874671, + "grad_norm": 0.26168414854028377, + "learning_rate": 6.540719691786101e-06, + "loss": 2.4644, + "step": 8285 + }, + { + "epoch": 0.9501203990368077, + "grad_norm": 0.2385760686453105, + "learning_rate": 6.51081600896658e-06, + "loss": 2.3778, + "step": 8286 + }, + { + "epoch": 0.9502350647861484, + "grad_norm": 0.26687836018085953, + "learning_rate": 6.480980393573255e-06, + "loss": 2.4915, + "step": 8287 + }, + { + "epoch": 0.9503497305354891, + "grad_norm": 0.2590077700080527, + "learning_rate": 6.451212849721389e-06, + "loss": 2.3588, + "step": 8288 + }, + { + "epoch": 0.9504643962848297, + "grad_norm": 0.22761665414136276, + "learning_rate": 6.421513381516752e-06, + "loss": 2.5281, + "step": 8289 + }, + { + "epoch": 0.9505790620341704, + "grad_norm": 0.27245624288842796, + "learning_rate": 6.391881993055903e-06, + "loss": 2.4664, + "step": 8290 + }, + { + "epoch": 0.950693727783511, + "grad_norm": 0.3011637604420565, + "learning_rate": 6.362318688425905e-06, + "loss": 2.337, + "step": 8291 + }, + { + "epoch": 0.9508083935328517, + "grad_norm": 0.25090579843869215, + "learning_rate": 6.332823471704441e-06, + "loss": 2.4416, + "step": 8292 + }, + { + "epoch": 0.9509230592821925, + "grad_norm": 0.26981447542141124, + "learning_rate": 6.303396346959867e-06, + "loss": 2.4451, + "step": 8293 + }, + { + "epoch": 0.951037725031533, + "grad_norm": 0.24675037790856236, + "learning_rate": 6.2740373182509916e-06, + "loss": 2.3511, + "step": 8294 + }, + { + "epoch": 0.9511523907808738, + "grad_norm": 0.25520009926014153, + "learning_rate": 6.244746389627409e-06, + "loss": 2.3927, + "step": 8295 + }, + { + "epoch": 0.9512670565302144, + "grad_norm": 0.26536958202966315, + "learning_rate": 6.215523565129277e-06, + "loss": 2.4324, + "step": 8296 + }, + { + "epoch": 0.9513817222795551, + "grad_norm": 0.23027959919447816, + "learning_rate": 6.186368848787261e-06, + "loss": 2.3214, + "step": 8297 + }, + { + "epoch": 0.9514963880288958, + "grad_norm": 0.25824101648853826, + "learning_rate": 6.157282244622697e-06, + "loss": 2.3741, + "step": 8298 + }, + { + "epoch": 0.9516110537782364, + "grad_norm": 0.2574870582555831, + "learning_rate": 6.1282637566476e-06, + "loss": 2.3825, + "step": 8299 + }, + { + "epoch": 0.9517257195275771, + "grad_norm": 0.22674492358316906, + "learning_rate": 6.09931338886438e-06, + "loss": 2.3879, + "step": 8300 + }, + { + "epoch": 0.9518403852769178, + "grad_norm": 0.2442943026143882, + "learning_rate": 6.070431145266342e-06, + "loss": 2.4195, + "step": 8301 + }, + { + "epoch": 0.9519550510262584, + "grad_norm": 0.2710190308868985, + "learning_rate": 6.041617029837188e-06, + "loss": 2.3508, + "step": 8302 + }, + { + "epoch": 0.9520697167755992, + "grad_norm": 0.2487835860780046, + "learning_rate": 6.012871046551238e-06, + "loss": 2.3725, + "step": 8303 + }, + { + "epoch": 0.9521843825249398, + "grad_norm": 0.24631300714824145, + "learning_rate": 5.984193199373433e-06, + "loss": 2.3199, + "step": 8304 + }, + { + "epoch": 0.9522990482742805, + "grad_norm": 0.2828104915810898, + "learning_rate": 5.955583492259442e-06, + "loss": 2.3485, + "step": 8305 + }, + { + "epoch": 0.9524137140236212, + "grad_norm": 0.27032574254147634, + "learning_rate": 5.92704192915533e-06, + "loss": 2.3933, + "step": 8306 + }, + { + "epoch": 0.9525283797729618, + "grad_norm": 0.23853866638329677, + "learning_rate": 5.898568513997893e-06, + "loss": 2.4829, + "step": 8307 + }, + { + "epoch": 0.9526430455223025, + "grad_norm": 0.2700118492653518, + "learning_rate": 5.870163250714544e-06, + "loss": 2.27, + "step": 8308 + }, + { + "epoch": 0.9527577112716432, + "grad_norm": 0.2529457667048586, + "learning_rate": 5.841826143223094e-06, + "loss": 2.3639, + "step": 8309 + }, + { + "epoch": 0.9528723770209838, + "grad_norm": 0.2534768616603229, + "learning_rate": 5.8135571954323065e-06, + "loss": 2.5443, + "step": 8310 + }, + { + "epoch": 0.9529870427703245, + "grad_norm": 0.250687673797539, + "learning_rate": 5.7853564112412274e-06, + "loss": 2.4916, + "step": 8311 + }, + { + "epoch": 0.9531017085196651, + "grad_norm": 0.2544087508919549, + "learning_rate": 5.75722379453969e-06, + "loss": 2.3374, + "step": 8312 + }, + { + "epoch": 0.9532163742690059, + "grad_norm": 0.26331795354540927, + "learning_rate": 5.7291593492079245e-06, + "loss": 2.3509, + "step": 8313 + }, + { + "epoch": 0.9533310400183466, + "grad_norm": 0.2535627933490605, + "learning_rate": 5.701163079117e-06, + "loss": 2.3188, + "step": 8314 + }, + { + "epoch": 0.9534457057676872, + "grad_norm": 0.24786966133909533, + "learning_rate": 5.6732349881284394e-06, + "loss": 2.3342, + "step": 8315 + }, + { + "epoch": 0.9535603715170279, + "grad_norm": 0.2543877198308559, + "learning_rate": 5.6453750800944395e-06, + "loss": 2.3627, + "step": 8316 + }, + { + "epoch": 0.9536750372663685, + "grad_norm": 0.26908684213864814, + "learning_rate": 5.617583358857647e-06, + "loss": 2.3906, + "step": 8317 + }, + { + "epoch": 0.9537897030157092, + "grad_norm": 0.2571294618781444, + "learning_rate": 5.589859828251443e-06, + "loss": 2.376, + "step": 8318 + }, + { + "epoch": 0.9539043687650499, + "grad_norm": 0.2288080190888788, + "learning_rate": 5.56220449209971e-06, + "loss": 2.2848, + "step": 8319 + }, + { + "epoch": 0.9540190345143905, + "grad_norm": 0.2675439354218467, + "learning_rate": 5.5346173542171195e-06, + "loss": 2.3461, + "step": 8320 + }, + { + "epoch": 0.9541337002637312, + "grad_norm": 0.2545697958252429, + "learning_rate": 5.507098418408629e-06, + "loss": 2.2829, + "step": 8321 + }, + { + "epoch": 0.954248366013072, + "grad_norm": 0.2640037128881111, + "learning_rate": 5.479647688470035e-06, + "loss": 2.4964, + "step": 8322 + }, + { + "epoch": 0.9543630317624126, + "grad_norm": 0.24680418294124895, + "learning_rate": 5.4522651681876976e-06, + "loss": 2.4496, + "step": 8323 + }, + { + "epoch": 0.9544776975117533, + "grad_norm": 0.27937992197459405, + "learning_rate": 5.424950861338318e-06, + "loss": 2.5054, + "step": 8324 + }, + { + "epoch": 0.9545923632610939, + "grad_norm": 0.26686293410864853, + "learning_rate": 5.397704771689549e-06, + "loss": 2.3905, + "step": 8325 + }, + { + "epoch": 0.9547070290104346, + "grad_norm": 0.27647543565482224, + "learning_rate": 5.3705269029994955e-06, + "loss": 2.3988, + "step": 8326 + }, + { + "epoch": 0.9548216947597753, + "grad_norm": 0.24235521436265892, + "learning_rate": 5.343417259016714e-06, + "loss": 2.3787, + "step": 8327 + }, + { + "epoch": 0.9549363605091159, + "grad_norm": 0.2579202116584383, + "learning_rate": 5.316375843480492e-06, + "loss": 2.4601, + "step": 8328 + }, + { + "epoch": 0.9550510262584566, + "grad_norm": 0.24670098455650555, + "learning_rate": 5.289402660120735e-06, + "loss": 2.4395, + "step": 8329 + }, + { + "epoch": 0.9551656920077972, + "grad_norm": 0.29804283541135873, + "learning_rate": 5.262497712657799e-06, + "loss": 2.4581, + "step": 8330 + }, + { + "epoch": 0.9552803577571379, + "grad_norm": 0.24559413791377968, + "learning_rate": 5.235661004802772e-06, + "loss": 2.3473, + "step": 8331 + }, + { + "epoch": 0.9553950235064786, + "grad_norm": 0.2676532781782227, + "learning_rate": 5.208892540257249e-06, + "loss": 2.571, + "step": 8332 + }, + { + "epoch": 0.9555096892558193, + "grad_norm": 0.2723955925299646, + "learning_rate": 5.1821923227134415e-06, + "loss": 2.5049, + "step": 8333 + }, + { + "epoch": 0.95562435500516, + "grad_norm": 0.2730711463800587, + "learning_rate": 5.155560355854072e-06, + "loss": 2.4023, + "step": 8334 + }, + { + "epoch": 0.9557390207545007, + "grad_norm": 0.24420291135179673, + "learning_rate": 5.12899664335259e-06, + "loss": 2.4451, + "step": 8335 + }, + { + "epoch": 0.9558536865038413, + "grad_norm": 0.2357257651263144, + "learning_rate": 5.102501188872899e-06, + "loss": 2.4001, + "step": 8336 + }, + { + "epoch": 0.955968352253182, + "grad_norm": 0.2523126556735753, + "learning_rate": 5.076073996069575e-06, + "loss": 2.2557, + "step": 8337 + }, + { + "epoch": 0.9560830180025226, + "grad_norm": 0.2791939052779133, + "learning_rate": 5.049715068587757e-06, + "loss": 2.3186, + "step": 8338 + }, + { + "epoch": 0.9561976837518633, + "grad_norm": 0.27711418022798756, + "learning_rate": 5.023424410063037e-06, + "loss": 2.5132, + "step": 8339 + }, + { + "epoch": 0.956312349501204, + "grad_norm": 0.23558857933079597, + "learning_rate": 4.997202024121905e-06, + "loss": 2.4132, + "step": 8340 + }, + { + "epoch": 0.9564270152505446, + "grad_norm": 0.253260530868003, + "learning_rate": 4.971047914381133e-06, + "loss": 2.4074, + "step": 8341 + }, + { + "epoch": 0.9565416809998853, + "grad_norm": 0.25437723525719685, + "learning_rate": 4.944962084448168e-06, + "loss": 2.4498, + "step": 8342 + }, + { + "epoch": 0.9566563467492261, + "grad_norm": 0.2610423914681025, + "learning_rate": 4.918944537921078e-06, + "loss": 2.4428, + "step": 8343 + }, + { + "epoch": 0.9567710124985667, + "grad_norm": 0.25845056010930106, + "learning_rate": 4.89299527838849e-06, + "loss": 2.4038, + "step": 8344 + }, + { + "epoch": 0.9568856782479074, + "grad_norm": 0.24714501010664763, + "learning_rate": 4.867114309429599e-06, + "loss": 2.1805, + "step": 8345 + }, + { + "epoch": 0.957000343997248, + "grad_norm": 0.2745990649825864, + "learning_rate": 4.84130163461427e-06, + "loss": 2.3015, + "step": 8346 + }, + { + "epoch": 0.9571150097465887, + "grad_norm": 0.2632636359137389, + "learning_rate": 4.815557257502712e-06, + "loss": 2.5008, + "step": 8347 + }, + { + "epoch": 0.9572296754959294, + "grad_norm": 0.2519043709936815, + "learning_rate": 4.789881181645972e-06, + "loss": 2.2929, + "step": 8348 + }, + { + "epoch": 0.95734434124527, + "grad_norm": 0.29761126367304686, + "learning_rate": 4.7642734105855515e-06, + "loss": 2.4881, + "step": 8349 + }, + { + "epoch": 0.9574590069946107, + "grad_norm": 0.24442059022814722, + "learning_rate": 4.738733947853568e-06, + "loss": 2.3661, + "step": 8350 + }, + { + "epoch": 0.9575736727439513, + "grad_norm": 0.24876567862219537, + "learning_rate": 4.713262796972706e-06, + "loss": 2.3632, + "step": 8351 + }, + { + "epoch": 0.957688338493292, + "grad_norm": 0.25481764587786854, + "learning_rate": 4.687859961456265e-06, + "loss": 2.2325, + "step": 8352 + }, + { + "epoch": 0.9578030042426328, + "grad_norm": 0.2571578782698582, + "learning_rate": 4.662525444807941e-06, + "loss": 2.2839, + "step": 8353 + }, + { + "epoch": 0.9579176699919734, + "grad_norm": 0.2650279118932301, + "learning_rate": 4.637259250522274e-06, + "loss": 2.3512, + "step": 8354 + }, + { + "epoch": 0.9580323357413141, + "grad_norm": 0.25210339343781624, + "learning_rate": 4.612061382084254e-06, + "loss": 2.2983, + "step": 8355 + }, + { + "epoch": 0.9581470014906548, + "grad_norm": 0.2611284550921159, + "learning_rate": 4.586931842969378e-06, + "loss": 2.4196, + "step": 8356 + }, + { + "epoch": 0.9582616672399954, + "grad_norm": 0.26891327701704676, + "learning_rate": 4.561870636643817e-06, + "loss": 2.402, + "step": 8357 + }, + { + "epoch": 0.9583763329893361, + "grad_norm": 0.2557116036715242, + "learning_rate": 4.5368777665643625e-06, + "loss": 2.3076, + "step": 8358 + }, + { + "epoch": 0.9584909987386767, + "grad_norm": 0.2412888538341008, + "learning_rate": 4.511953236178146e-06, + "loss": 2.3084, + "step": 8359 + }, + { + "epoch": 0.9586056644880174, + "grad_norm": 0.27363336282448064, + "learning_rate": 4.487097048923139e-06, + "loss": 2.5273, + "step": 8360 + }, + { + "epoch": 0.9587203302373581, + "grad_norm": 0.2774842908544762, + "learning_rate": 4.462309208227821e-06, + "loss": 2.4187, + "step": 8361 + }, + { + "epoch": 0.9588349959866987, + "grad_norm": 0.25774516848941514, + "learning_rate": 4.43758971751107e-06, + "loss": 2.3091, + "step": 8362 + }, + { + "epoch": 0.9589496617360395, + "grad_norm": 0.2622898867122452, + "learning_rate": 4.412938580182546e-06, + "loss": 2.4931, + "step": 8363 + }, + { + "epoch": 0.9590643274853801, + "grad_norm": 0.2562935073562918, + "learning_rate": 4.388355799642418e-06, + "loss": 2.4813, + "step": 8364 + }, + { + "epoch": 0.9591789932347208, + "grad_norm": 0.24449369236245136, + "learning_rate": 4.363841379281364e-06, + "loss": 2.3105, + "step": 8365 + }, + { + "epoch": 0.9592936589840615, + "grad_norm": 0.2571371389296851, + "learning_rate": 4.339395322480733e-06, + "loss": 2.4525, + "step": 8366 + }, + { + "epoch": 0.9594083247334021, + "grad_norm": 0.2449777465007151, + "learning_rate": 4.315017632612383e-06, + "loss": 2.4308, + "step": 8367 + }, + { + "epoch": 0.9595229904827428, + "grad_norm": 0.24055070955795596, + "learning_rate": 4.2907083130387385e-06, + "loss": 2.4023, + "step": 8368 + }, + { + "epoch": 0.9596376562320835, + "grad_norm": 0.2501889870635449, + "learning_rate": 4.266467367112725e-06, + "loss": 2.3454, + "step": 8369 + }, + { + "epoch": 0.9597523219814241, + "grad_norm": 0.26470128591142694, + "learning_rate": 4.242294798178059e-06, + "loss": 2.2379, + "step": 8370 + }, + { + "epoch": 0.9598669877307648, + "grad_norm": 0.31153605708143683, + "learning_rate": 4.21819060956885e-06, + "loss": 2.4934, + "step": 8371 + }, + { + "epoch": 0.9599816534801054, + "grad_norm": 0.2562742180643882, + "learning_rate": 4.1941548046097176e-06, + "loss": 2.3686, + "step": 8372 + }, + { + "epoch": 0.9600963192294462, + "grad_norm": 0.28757836786342056, + "learning_rate": 4.170187386616064e-06, + "loss": 2.2426, + "step": 8373 + }, + { + "epoch": 0.9602109849787869, + "grad_norm": 0.2636900410544457, + "learning_rate": 4.146288358893635e-06, + "loss": 2.5788, + "step": 8374 + }, + { + "epoch": 0.9603256507281275, + "grad_norm": 0.27547721186056146, + "learning_rate": 4.1224577247389036e-06, + "loss": 2.3483, + "step": 8375 + }, + { + "epoch": 0.9604403164774682, + "grad_norm": 0.23583431981462877, + "learning_rate": 4.098695487438908e-06, + "loss": 2.3554, + "step": 8376 + }, + { + "epoch": 0.9605549822268089, + "grad_norm": 0.27678831304033363, + "learning_rate": 4.075001650271082e-06, + "loss": 2.5244, + "step": 8377 + }, + { + "epoch": 0.9606696479761495, + "grad_norm": 0.2554363846591312, + "learning_rate": 4.051376216503588e-06, + "loss": 2.3167, + "step": 8378 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 0.2580344106752261, + "learning_rate": 4.027819189395099e-06, + "loss": 2.4036, + "step": 8379 + }, + { + "epoch": 0.9608989794748308, + "grad_norm": 0.2816616575444161, + "learning_rate": 4.004330572194903e-06, + "loss": 2.4606, + "step": 8380 + }, + { + "epoch": 0.9610136452241715, + "grad_norm": 0.28968082659291905, + "learning_rate": 3.9809103681426876e-06, + "loss": 2.4585, + "step": 8381 + }, + { + "epoch": 0.9611283109735123, + "grad_norm": 0.25259906443195596, + "learning_rate": 3.9575585804689785e-06, + "loss": 2.3501, + "step": 8382 + }, + { + "epoch": 0.9612429767228529, + "grad_norm": 0.25662404265332256, + "learning_rate": 3.9342752123946446e-06, + "loss": 2.4186, + "step": 8383 + }, + { + "epoch": 0.9613576424721936, + "grad_norm": 0.28381785279693855, + "learning_rate": 3.911060267131117e-06, + "loss": 2.5626, + "step": 8384 + }, + { + "epoch": 0.9614723082215342, + "grad_norm": 0.2736075773189394, + "learning_rate": 3.8879137478805006e-06, + "loss": 2.3913, + "step": 8385 + }, + { + "epoch": 0.9615869739708749, + "grad_norm": 0.2639082016217544, + "learning_rate": 3.864835657835464e-06, + "loss": 2.2596, + "step": 8386 + }, + { + "epoch": 0.9617016397202156, + "grad_norm": 0.25312072830550825, + "learning_rate": 3.841826000179127e-06, + "loss": 2.3277, + "step": 8387 + }, + { + "epoch": 0.9618163054695562, + "grad_norm": 0.2728875003965544, + "learning_rate": 3.818884778085286e-06, + "loss": 2.3821, + "step": 8388 + }, + { + "epoch": 0.9619309712188969, + "grad_norm": 0.24836051430557018, + "learning_rate": 3.7960119947181847e-06, + "loss": 2.4183, + "step": 8389 + }, + { + "epoch": 0.9620456369682376, + "grad_norm": 0.2481167959315104, + "learning_rate": 3.7732076532327463e-06, + "loss": 2.4998, + "step": 8390 + }, + { + "epoch": 0.9621603027175782, + "grad_norm": 0.2621211159878882, + "learning_rate": 3.7504717567743986e-06, + "loss": 2.4555, + "step": 8391 + }, + { + "epoch": 0.962274968466919, + "grad_norm": 0.2405782746911224, + "learning_rate": 3.7278043084790214e-06, + "loss": 2.3392, + "step": 8392 + }, + { + "epoch": 0.9623896342162596, + "grad_norm": 0.2600099823094173, + "learning_rate": 3.705205311473281e-06, + "loss": 2.2495, + "step": 8393 + }, + { + "epoch": 0.9625042999656003, + "grad_norm": 0.26291773328794926, + "learning_rate": 3.682674768874239e-06, + "loss": 2.2289, + "step": 8394 + }, + { + "epoch": 0.962618965714941, + "grad_norm": 0.2884000301364202, + "learning_rate": 3.660212683789521e-06, + "loss": 2.4623, + "step": 8395 + }, + { + "epoch": 0.9627336314642816, + "grad_norm": 0.2615649223766214, + "learning_rate": 3.637819059317371e-06, + "loss": 2.3936, + "step": 8396 + }, + { + "epoch": 0.9628482972136223, + "grad_norm": 0.2772111623220838, + "learning_rate": 3.61549389854654e-06, + "loss": 2.3991, + "step": 8397 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.24320704776268254, + "learning_rate": 3.5932372045563986e-06, + "loss": 2.416, + "step": 8398 + }, + { + "epoch": 0.9630776287123036, + "grad_norm": 0.23006586809253868, + "learning_rate": 3.571048980416769e-06, + "loss": 2.3647, + "step": 8399 + }, + { + "epoch": 0.9631922944616443, + "grad_norm": 0.27537648184512703, + "learning_rate": 3.5489292291882023e-06, + "loss": 2.3363, + "step": 8400 + }, + { + "epoch": 0.9633069602109849, + "grad_norm": 0.26573239909756463, + "learning_rate": 3.526877953921592e-06, + "loss": 2.4941, + "step": 8401 + }, + { + "epoch": 0.9634216259603257, + "grad_norm": 0.23504372292061523, + "learning_rate": 3.5048951576585607e-06, + "loss": 2.438, + "step": 8402 + }, + { + "epoch": 0.9635362917096664, + "grad_norm": 0.23039198012059517, + "learning_rate": 3.482980843431127e-06, + "loss": 2.3353, + "step": 8403 + }, + { + "epoch": 0.963650957459007, + "grad_norm": 0.2807335610428062, + "learning_rate": 3.46113501426204e-06, + "loss": 2.5978, + "step": 8404 + }, + { + "epoch": 0.9637656232083477, + "grad_norm": 0.24931737692248507, + "learning_rate": 3.4393576731645003e-06, + "loss": 2.4239, + "step": 8405 + }, + { + "epoch": 0.9638802889576883, + "grad_norm": 0.2638901800050044, + "learning_rate": 3.4176488231422163e-06, + "loss": 2.4502, + "step": 8406 + }, + { + "epoch": 0.963994954707029, + "grad_norm": 0.24767178698275336, + "learning_rate": 3.396008467189626e-06, + "loss": 2.2911, + "step": 8407 + }, + { + "epoch": 0.9641096204563697, + "grad_norm": 0.23863421504479457, + "learning_rate": 3.374436608291509e-06, + "loss": 2.1898, + "step": 8408 + }, + { + "epoch": 0.9642242862057103, + "grad_norm": 0.27956160235255784, + "learning_rate": 3.3529332494232623e-06, + "loss": 2.5438, + "step": 8409 + }, + { + "epoch": 0.964338951955051, + "grad_norm": 0.316384202323507, + "learning_rate": 3.331498393550958e-06, + "loss": 2.3726, + "step": 8410 + }, + { + "epoch": 0.9644536177043918, + "grad_norm": 0.24870588118664255, + "learning_rate": 3.31013204363112e-06, + "loss": 2.329, + "step": 8411 + }, + { + "epoch": 0.9645682834537324, + "grad_norm": 0.29291804670613175, + "learning_rate": 3.2888342026107797e-06, + "loss": 2.4762, + "step": 8412 + }, + { + "epoch": 0.9646829492030731, + "grad_norm": 0.25438408331469714, + "learning_rate": 3.267604873427532e-06, + "loss": 2.3931, + "step": 8413 + }, + { + "epoch": 0.9647976149524137, + "grad_norm": 0.25729325979079226, + "learning_rate": 3.2464440590097013e-06, + "loss": 2.4832, + "step": 8414 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 0.24890578951332115, + "learning_rate": 3.225351762275841e-06, + "loss": 2.3521, + "step": 8415 + }, + { + "epoch": 0.9650269464510951, + "grad_norm": 0.2654343198648445, + "learning_rate": 3.204327986135347e-06, + "loss": 2.291, + "step": 8416 + }, + { + "epoch": 0.9651416122004357, + "grad_norm": 0.26795019871067954, + "learning_rate": 3.183372733488066e-06, + "loss": 2.4191, + "step": 8417 + }, + { + "epoch": 0.9652562779497764, + "grad_norm": 0.2428742026047186, + "learning_rate": 3.162486007224297e-06, + "loss": 2.3942, + "step": 8418 + }, + { + "epoch": 0.965370943699117, + "grad_norm": 0.2929667099073874, + "learning_rate": 3.1416678102249574e-06, + "loss": 2.3929, + "step": 8419 + }, + { + "epoch": 0.9654856094484577, + "grad_norm": 0.24590684530648857, + "learning_rate": 3.1209181453616396e-06, + "loss": 2.3613, + "step": 8420 + }, + { + "epoch": 0.9656002751977985, + "grad_norm": 0.2833691750923599, + "learning_rate": 3.1002370154962212e-06, + "loss": 2.5156, + "step": 8421 + }, + { + "epoch": 0.9657149409471391, + "grad_norm": 0.2587312929452015, + "learning_rate": 3.079624423481364e-06, + "loss": 2.3396, + "step": 8422 + }, + { + "epoch": 0.9658296066964798, + "grad_norm": 0.2704084886452787, + "learning_rate": 3.0590803721601833e-06, + "loss": 2.2815, + "step": 8423 + }, + { + "epoch": 0.9659442724458205, + "grad_norm": 0.2905788352811762, + "learning_rate": 3.0386048643662455e-06, + "loss": 2.4134, + "step": 8424 + }, + { + "epoch": 0.9660589381951611, + "grad_norm": 0.24361707702662044, + "learning_rate": 3.018197902923847e-06, + "loss": 2.2445, + "step": 8425 + }, + { + "epoch": 0.9661736039445018, + "grad_norm": 0.2550232728292785, + "learning_rate": 2.997859490647736e-06, + "loss": 2.4258, + "step": 8426 + }, + { + "epoch": 0.9662882696938424, + "grad_norm": 0.24151542870021409, + "learning_rate": 2.9775896303431694e-06, + "loss": 2.2899, + "step": 8427 + }, + { + "epoch": 0.9664029354431831, + "grad_norm": 0.2727833915889519, + "learning_rate": 2.9573883248060207e-06, + "loss": 2.3635, + "step": 8428 + }, + { + "epoch": 0.9665176011925238, + "grad_norm": 0.24532817830146272, + "learning_rate": 2.9372555768226172e-06, + "loss": 2.2529, + "step": 8429 + }, + { + "epoch": 0.9666322669418644, + "grad_norm": 0.28083526271437764, + "learning_rate": 2.91719138916996e-06, + "loss": 2.4243, + "step": 8430 + }, + { + "epoch": 0.9667469326912052, + "grad_norm": 0.25164075668168007, + "learning_rate": 2.897195764615501e-06, + "loss": 2.5141, + "step": 8431 + }, + { + "epoch": 0.9668615984405458, + "grad_norm": 0.2513564476712266, + "learning_rate": 2.8772687059172577e-06, + "loss": 2.4376, + "step": 8432 + }, + { + "epoch": 0.9669762641898865, + "grad_norm": 0.2842053189882223, + "learning_rate": 2.8574102158237525e-06, + "loss": 2.4194, + "step": 8433 + }, + { + "epoch": 0.9670909299392272, + "grad_norm": 0.2611075009076694, + "learning_rate": 2.8376202970740726e-06, + "loss": 2.2427, + "step": 8434 + }, + { + "epoch": 0.9672055956885678, + "grad_norm": 0.28626553183420866, + "learning_rate": 2.8178989523979235e-06, + "loss": 2.437, + "step": 8435 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.2623496442953994, + "learning_rate": 2.7982461845154627e-06, + "loss": 2.4218, + "step": 8436 + }, + { + "epoch": 0.9674349271872492, + "grad_norm": 0.24122108299802497, + "learning_rate": 2.7786619961374103e-06, + "loss": 2.3628, + "step": 8437 + }, + { + "epoch": 0.9675495929365898, + "grad_norm": 0.2401193007514644, + "learning_rate": 2.7591463899650505e-06, + "loss": 2.3108, + "step": 8438 + }, + { + "epoch": 0.9676642586859305, + "grad_norm": 0.2859965051341742, + "learning_rate": 2.7396993686901184e-06, + "loss": 2.4414, + "step": 8439 + }, + { + "epoch": 0.9677789244352711, + "grad_norm": 0.251153094870571, + "learning_rate": 2.72032093499508e-06, + "loss": 2.3124, + "step": 8440 + }, + { + "epoch": 0.9678935901846119, + "grad_norm": 0.2726179968976424, + "learning_rate": 2.701011091552741e-06, + "loss": 2.354, + "step": 8441 + }, + { + "epoch": 0.9680082559339526, + "grad_norm": 0.24020894522782626, + "learning_rate": 2.6817698410265266e-06, + "loss": 2.3449, + "step": 8442 + }, + { + "epoch": 0.9681229216832932, + "grad_norm": 0.2628606635548168, + "learning_rate": 2.6625971860704256e-06, + "loss": 2.3972, + "step": 8443 + }, + { + "epoch": 0.9682375874326339, + "grad_norm": 0.24459212683548268, + "learning_rate": 2.643493129328933e-06, + "loss": 2.4123, + "step": 8444 + }, + { + "epoch": 0.9683522531819746, + "grad_norm": 0.25799431051324895, + "learning_rate": 2.624457673437053e-06, + "loss": 2.3618, + "step": 8445 + }, + { + "epoch": 0.9684669189313152, + "grad_norm": 0.29717022944311133, + "learning_rate": 2.605490821020462e-06, + "loss": 2.4505, + "step": 8446 + }, + { + "epoch": 0.9685815846806559, + "grad_norm": 0.25993638806101477, + "learning_rate": 2.5865925746951234e-06, + "loss": 2.3699, + "step": 8447 + }, + { + "epoch": 0.9686962504299965, + "grad_norm": 0.2834277057767693, + "learning_rate": 2.5677629370678413e-06, + "loss": 2.2227, + "step": 8448 + }, + { + "epoch": 0.9688109161793372, + "grad_norm": 0.2437292527163565, + "learning_rate": 2.5490019107357045e-06, + "loss": 2.2666, + "step": 8449 + }, + { + "epoch": 0.968925581928678, + "grad_norm": 0.28649100931769983, + "learning_rate": 2.530309498286476e-06, + "loss": 2.4059, + "step": 8450 + }, + { + "epoch": 0.9690402476780186, + "grad_norm": 0.2982880823695621, + "learning_rate": 2.511685702298483e-06, + "loss": 2.3061, + "step": 8451 + }, + { + "epoch": 0.9691549134273593, + "grad_norm": 0.25532804029621126, + "learning_rate": 2.4931305253403925e-06, + "loss": 2.248, + "step": 8452 + }, + { + "epoch": 0.9692695791766999, + "grad_norm": 0.23928389861445995, + "learning_rate": 2.4746439699716016e-06, + "loss": 2.2957, + "step": 8453 + }, + { + "epoch": 0.9693842449260406, + "grad_norm": 0.24814515847985688, + "learning_rate": 2.4562260387420154e-06, + "loss": 2.2715, + "step": 8454 + }, + { + "epoch": 0.9694989106753813, + "grad_norm": 0.2808779510379786, + "learning_rate": 2.4378767341919907e-06, + "loss": 2.3207, + "step": 8455 + }, + { + "epoch": 0.9696135764247219, + "grad_norm": 0.259420852389618, + "learning_rate": 2.4195960588524467e-06, + "loss": 2.3086, + "step": 8456 + }, + { + "epoch": 0.9697282421740626, + "grad_norm": 0.25913114440745566, + "learning_rate": 2.401384015244867e-06, + "loss": 2.3251, + "step": 8457 + }, + { + "epoch": 0.9698429079234033, + "grad_norm": 0.27636273488207475, + "learning_rate": 2.383240605881354e-06, + "loss": 2.3595, + "step": 8458 + }, + { + "epoch": 0.9699575736727439, + "grad_norm": 0.25222454998088556, + "learning_rate": 2.365165833264293e-06, + "loss": 2.3091, + "step": 8459 + }, + { + "epoch": 0.9700722394220846, + "grad_norm": 0.26521689905308565, + "learning_rate": 2.3471596998868026e-06, + "loss": 2.356, + "step": 8460 + }, + { + "epoch": 0.9701869051714253, + "grad_norm": 0.26201129724181055, + "learning_rate": 2.329222208232562e-06, + "loss": 2.2731, + "step": 8461 + }, + { + "epoch": 0.970301570920766, + "grad_norm": 0.24982548082938946, + "learning_rate": 2.3113533607756478e-06, + "loss": 2.4669, + "step": 8462 + }, + { + "epoch": 0.9704162366701067, + "grad_norm": 0.2463293940012505, + "learning_rate": 2.293553159980699e-06, + "loss": 2.2925, + "step": 8463 + }, + { + "epoch": 0.9705309024194473, + "grad_norm": 0.23591816998375842, + "learning_rate": 2.2758216083029192e-06, + "loss": 2.3594, + "step": 8464 + }, + { + "epoch": 0.970645568168788, + "grad_norm": 0.2557743380233621, + "learning_rate": 2.2581587081881294e-06, + "loss": 2.367, + "step": 8465 + }, + { + "epoch": 0.9707602339181286, + "grad_norm": 0.2646908234834943, + "learning_rate": 2.240564462072492e-06, + "loss": 2.2461, + "step": 8466 + }, + { + "epoch": 0.9708748996674693, + "grad_norm": 0.26900631526322105, + "learning_rate": 2.2230388723828433e-06, + "loss": 2.4389, + "step": 8467 + }, + { + "epoch": 0.97098956541681, + "grad_norm": 0.2801739466040622, + "learning_rate": 2.205581941536472e-06, + "loss": 2.2958, + "step": 8468 + }, + { + "epoch": 0.9711042311661506, + "grad_norm": 0.2605246338911792, + "learning_rate": 2.1881936719411744e-06, + "loss": 2.3529, + "step": 8469 + }, + { + "epoch": 0.9712188969154913, + "grad_norm": 0.25239985832291245, + "learning_rate": 2.170874065995532e-06, + "loss": 2.3785, + "step": 8470 + }, + { + "epoch": 0.9713335626648321, + "grad_norm": 0.2696513861947799, + "learning_rate": 2.1536231260882446e-06, + "loss": 2.3186, + "step": 8471 + }, + { + "epoch": 0.9714482284141727, + "grad_norm": 0.29162139459134045, + "learning_rate": 2.1364408545988533e-06, + "loss": 2.3864, + "step": 8472 + }, + { + "epoch": 0.9715628941635134, + "grad_norm": 0.2663284470093609, + "learning_rate": 2.1193272538973516e-06, + "loss": 2.5046, + "step": 8473 + }, + { + "epoch": 0.971677559912854, + "grad_norm": 0.27015464372200615, + "learning_rate": 2.1022823263441294e-06, + "loss": 2.4469, + "step": 8474 + }, + { + "epoch": 0.9717922256621947, + "grad_norm": 0.2556707523662889, + "learning_rate": 2.08530607429025e-06, + "loss": 2.1617, + "step": 8475 + }, + { + "epoch": 0.9719068914115354, + "grad_norm": 0.26491338644190016, + "learning_rate": 2.0683985000773396e-06, + "loss": 2.3224, + "step": 8476 + }, + { + "epoch": 0.972021557160876, + "grad_norm": 0.24145245870429918, + "learning_rate": 2.0515596060373675e-06, + "loss": 2.3224, + "step": 8477 + }, + { + "epoch": 0.9721362229102167, + "grad_norm": 0.23731236863928826, + "learning_rate": 2.0347893944930306e-06, + "loss": 2.5342, + "step": 8478 + }, + { + "epoch": 0.9722508886595574, + "grad_norm": 0.31113528877749735, + "learning_rate": 2.0180878677573677e-06, + "loss": 2.3587, + "step": 8479 + }, + { + "epoch": 0.972365554408898, + "grad_norm": 0.25339015801507186, + "learning_rate": 2.0014550281340914e-06, + "loss": 2.2876, + "step": 8480 + }, + { + "epoch": 0.9724802201582388, + "grad_norm": 0.2419201820307682, + "learning_rate": 1.9848908779174223e-06, + "loss": 2.3492, + "step": 8481 + }, + { + "epoch": 0.9725948859075794, + "grad_norm": 0.2434474478898852, + "learning_rate": 1.9683954193920328e-06, + "loss": 2.3274, + "step": 8482 + }, + { + "epoch": 0.9727095516569201, + "grad_norm": 0.2704571497396297, + "learning_rate": 1.9519686548331028e-06, + "loss": 2.3012, + "step": 8483 + }, + { + "epoch": 0.9728242174062608, + "grad_norm": 0.2305453641312586, + "learning_rate": 1.935610586506431e-06, + "loss": 2.3197, + "step": 8484 + }, + { + "epoch": 0.9729388831556014, + "grad_norm": 0.2578872919862767, + "learning_rate": 1.9193212166683237e-06, + "loss": 2.3573, + "step": 8485 + }, + { + "epoch": 0.9730535489049421, + "grad_norm": 0.2675929036086628, + "learning_rate": 1.9031005475655948e-06, + "loss": 2.2994, + "step": 8486 + }, + { + "epoch": 0.9731682146542827, + "grad_norm": 0.27769510078029847, + "learning_rate": 1.8869485814355103e-06, + "loss": 2.2844, + "step": 8487 + }, + { + "epoch": 0.9732828804036234, + "grad_norm": 0.23492978133841613, + "learning_rate": 1.87086532050601e-06, + "loss": 2.3786, + "step": 8488 + }, + { + "epoch": 0.9733975461529641, + "grad_norm": 0.32549103613331437, + "learning_rate": 1.8548507669954307e-06, + "loss": 2.4331, + "step": 8489 + }, + { + "epoch": 0.9735122119023047, + "grad_norm": 0.2457448063532635, + "learning_rate": 1.838904923112672e-06, + "loss": 2.3706, + "step": 8490 + }, + { + "epoch": 0.9736268776516455, + "grad_norm": 0.23738500721549768, + "learning_rate": 1.8230277910571413e-06, + "loss": 2.4697, + "step": 8491 + }, + { + "epoch": 0.9737415434009862, + "grad_norm": 0.24351712546075663, + "learning_rate": 1.8072193730188091e-06, + "loss": 2.3576, + "step": 8492 + }, + { + "epoch": 0.9738562091503268, + "grad_norm": 0.25803284402512366, + "learning_rate": 1.791479671178209e-06, + "loss": 2.4386, + "step": 8493 + }, + { + "epoch": 0.9739708748996675, + "grad_norm": 0.2560372398616809, + "learning_rate": 1.7758086877062153e-06, + "loss": 2.3201, + "step": 8494 + }, + { + "epoch": 0.9740855406490081, + "grad_norm": 0.25040048963431133, + "learning_rate": 1.7602064247643768e-06, + "loss": 2.4845, + "step": 8495 + }, + { + "epoch": 0.9742002063983488, + "grad_norm": 0.24204034497013002, + "learning_rate": 1.7446728845048055e-06, + "loss": 2.2566, + "step": 8496 + }, + { + "epoch": 0.9743148721476895, + "grad_norm": 0.27725711187821334, + "learning_rate": 1.7292080690699542e-06, + "loss": 2.2911, + "step": 8497 + }, + { + "epoch": 0.9744295378970301, + "grad_norm": 0.2605717025827012, + "learning_rate": 1.7138119805929498e-06, + "loss": 2.3371, + "step": 8498 + }, + { + "epoch": 0.9745442036463708, + "grad_norm": 0.2584119966825838, + "learning_rate": 1.698484621197427e-06, + "loss": 2.4076, + "step": 8499 + }, + { + "epoch": 0.9746588693957114, + "grad_norm": 0.25005681982530353, + "learning_rate": 1.683225992997417e-06, + "loss": 2.302, + "step": 8500 + }, + { + "epoch": 0.9747735351450522, + "grad_norm": 0.2615039172756164, + "learning_rate": 1.668036098097625e-06, + "loss": 2.3953, + "step": 8501 + }, + { + "epoch": 0.9748882008943929, + "grad_norm": 0.2550620184315978, + "learning_rate": 1.652914938593153e-06, + "loss": 2.3409, + "step": 8502 + }, + { + "epoch": 0.9750028666437335, + "grad_norm": 0.24508727119384985, + "learning_rate": 1.6378625165697213e-06, + "loss": 2.3968, + "step": 8503 + }, + { + "epoch": 0.9751175323930742, + "grad_norm": 0.27767149087417653, + "learning_rate": 1.6228788341035028e-06, + "loss": 2.4108, + "step": 8504 + }, + { + "epoch": 0.9752321981424149, + "grad_norm": 0.2786241869901198, + "learning_rate": 1.607963893261233e-06, + "loss": 2.4078, + "step": 8505 + }, + { + "epoch": 0.9753468638917555, + "grad_norm": 0.24591812819938894, + "learning_rate": 1.5931176961000993e-06, + "loss": 2.3426, + "step": 8506 + }, + { + "epoch": 0.9754615296410962, + "grad_norm": 0.25936059468835, + "learning_rate": 1.5783402446679084e-06, + "loss": 2.4557, + "step": 8507 + }, + { + "epoch": 0.9755761953904368, + "grad_norm": 0.26959900676022736, + "learning_rate": 1.563631541002919e-06, + "loss": 2.3734, + "step": 8508 + }, + { + "epoch": 0.9756908611397775, + "grad_norm": 0.23271576336009359, + "learning_rate": 1.5489915871338411e-06, + "loss": 2.3424, + "step": 8509 + }, + { + "epoch": 0.9758055268891183, + "grad_norm": 0.3047475012515261, + "learning_rate": 1.5344203850800597e-06, + "loss": 2.489, + "step": 8510 + }, + { + "epoch": 0.9759201926384589, + "grad_norm": 0.27608972436248186, + "learning_rate": 1.519917936851356e-06, + "loss": 2.4136, + "step": 8511 + }, + { + "epoch": 0.9760348583877996, + "grad_norm": 0.2775226381175172, + "learning_rate": 1.5054842444480743e-06, + "loss": 2.3587, + "step": 8512 + }, + { + "epoch": 0.9761495241371403, + "grad_norm": 0.25709021435459783, + "learning_rate": 1.491119309861122e-06, + "loss": 2.4863, + "step": 8513 + }, + { + "epoch": 0.9762641898864809, + "grad_norm": 0.27395764061458866, + "learning_rate": 1.4768231350717477e-06, + "loss": 2.2367, + "step": 8514 + }, + { + "epoch": 0.9763788556358216, + "grad_norm": 0.2608829989376526, + "learning_rate": 1.4625957220519292e-06, + "loss": 2.351, + "step": 8515 + }, + { + "epoch": 0.9764935213851622, + "grad_norm": 0.2611176925670891, + "learning_rate": 1.4484370727640417e-06, + "loss": 2.437, + "step": 8516 + }, + { + "epoch": 0.9766081871345029, + "grad_norm": 0.255556131065512, + "learning_rate": 1.4343471891610783e-06, + "loss": 2.4218, + "step": 8517 + }, + { + "epoch": 0.9767228528838436, + "grad_norm": 0.26108419748104433, + "learning_rate": 1.4203260731863176e-06, + "loss": 2.3108, + "step": 8518 + }, + { + "epoch": 0.9768375186331842, + "grad_norm": 0.2550087628600148, + "learning_rate": 1.4063737267737686e-06, + "loss": 2.2847, + "step": 8519 + }, + { + "epoch": 0.976952184382525, + "grad_norm": 0.2733763933884475, + "learning_rate": 1.392490151847947e-06, + "loss": 2.232, + "step": 8520 + }, + { + "epoch": 0.9770668501318656, + "grad_norm": 0.26434913676493516, + "learning_rate": 1.3786753503237658e-06, + "loss": 2.3077, + "step": 8521 + }, + { + "epoch": 0.9771815158812063, + "grad_norm": 0.25819237027229835, + "learning_rate": 1.3649293241067007e-06, + "loss": 2.4131, + "step": 8522 + }, + { + "epoch": 0.977296181630547, + "grad_norm": 0.27401061392284126, + "learning_rate": 1.3512520750928458e-06, + "loss": 2.3191, + "step": 8523 + }, + { + "epoch": 0.9774108473798876, + "grad_norm": 0.24198730650660902, + "learning_rate": 1.3376436051686369e-06, + "loss": 2.2846, + "step": 8524 + }, + { + "epoch": 0.9775255131292283, + "grad_norm": 0.2585859894798289, + "learning_rate": 1.324103916211128e-06, + "loss": 2.3691, + "step": 8525 + }, + { + "epoch": 0.977640178878569, + "grad_norm": 0.2638394998218401, + "learning_rate": 1.310633010087825e-06, + "loss": 2.4373, + "step": 8526 + }, + { + "epoch": 0.9777548446279096, + "grad_norm": 0.2552706076084606, + "learning_rate": 1.297230888656853e-06, + "loss": 2.4431, + "step": 8527 + }, + { + "epoch": 0.9778695103772503, + "grad_norm": 0.2752598378347172, + "learning_rate": 1.283897553766733e-06, + "loss": 2.4102, + "step": 8528 + }, + { + "epoch": 0.9779841761265909, + "grad_norm": 0.25793116206924965, + "learning_rate": 1.2706330072564942e-06, + "loss": 2.3146, + "step": 8529 + }, + { + "epoch": 0.9780988418759317, + "grad_norm": 0.2784917858625757, + "learning_rate": 1.2574372509558397e-06, + "loss": 2.3299, + "step": 8530 + }, + { + "epoch": 0.9782135076252724, + "grad_norm": 0.2774151749871697, + "learning_rate": 1.2443102866848134e-06, + "loss": 2.4968, + "step": 8531 + }, + { + "epoch": 0.978328173374613, + "grad_norm": 0.28230478265078923, + "learning_rate": 1.231252116254078e-06, + "loss": 2.3479, + "step": 8532 + }, + { + "epoch": 0.9784428391239537, + "grad_norm": 0.25304453276351124, + "learning_rate": 1.2182627414646375e-06, + "loss": 2.4481, + "step": 8533 + }, + { + "epoch": 0.9785575048732943, + "grad_norm": 0.2767444064552373, + "learning_rate": 1.2053421641082806e-06, + "loss": 2.2822, + "step": 8534 + }, + { + "epoch": 0.978672170622635, + "grad_norm": 0.2565010664441952, + "learning_rate": 1.1924903859670266e-06, + "loss": 2.4065, + "step": 8535 + }, + { + "epoch": 0.9787868363719757, + "grad_norm": 0.23713810154018222, + "learning_rate": 1.1797074088135685e-06, + "loss": 2.4407, + "step": 8536 + }, + { + "epoch": 0.9789015021213163, + "grad_norm": 0.2584627467071504, + "learning_rate": 1.1669932344111623e-06, + "loss": 2.4213, + "step": 8537 + }, + { + "epoch": 0.979016167870657, + "grad_norm": 0.2637919677709154, + "learning_rate": 1.1543478645134054e-06, + "loss": 2.5363, + "step": 8538 + }, + { + "epoch": 0.9791308336199978, + "grad_norm": 0.2865891791062618, + "learning_rate": 1.1417713008644582e-06, + "loss": 2.4531, + "step": 8539 + }, + { + "epoch": 0.9792454993693384, + "grad_norm": 0.2609159710137132, + "learning_rate": 1.129263545199044e-06, + "loss": 2.3607, + "step": 8540 + }, + { + "epoch": 0.9793601651186791, + "grad_norm": 0.28009965269635984, + "learning_rate": 1.1168245992424497e-06, + "loss": 2.393, + "step": 8541 + }, + { + "epoch": 0.9794748308680197, + "grad_norm": 0.2370978025934183, + "learning_rate": 1.1044544647102473e-06, + "loss": 2.3487, + "step": 8542 + }, + { + "epoch": 0.9795894966173604, + "grad_norm": 0.23859973817093474, + "learning_rate": 1.092153143308794e-06, + "loss": 2.3618, + "step": 8543 + }, + { + "epoch": 0.9797041623667011, + "grad_norm": 0.2641864262541659, + "learning_rate": 1.0799206367347326e-06, + "loss": 2.5617, + "step": 8544 + }, + { + "epoch": 0.9798188281160417, + "grad_norm": 0.26185422861275354, + "learning_rate": 1.06775694667538e-06, + "loss": 2.4492, + "step": 8545 + }, + { + "epoch": 0.9799334938653824, + "grad_norm": 0.26620400934088717, + "learning_rate": 1.0556620748083945e-06, + "loss": 2.4071, + "step": 8546 + }, + { + "epoch": 0.9800481596147231, + "grad_norm": 0.23265135526070768, + "learning_rate": 1.043636022802108e-06, + "loss": 2.3724, + "step": 8547 + }, + { + "epoch": 0.9801628253640637, + "grad_norm": 0.27895118415445375, + "learning_rate": 1.0316787923152493e-06, + "loss": 2.4107, + "step": 8548 + }, + { + "epoch": 0.9802774911134045, + "grad_norm": 0.27337607152649684, + "learning_rate": 1.0197903849971103e-06, + "loss": 2.2803, + "step": 8549 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.27569940593712866, + "learning_rate": 1.0079708024874345e-06, + "loss": 2.3929, + "step": 8550 + }, + { + "epoch": 0.9805068226120858, + "grad_norm": 0.24101819384287163, + "learning_rate": 9.96220046416585e-07, + "loss": 2.281, + "step": 8551 + }, + { + "epoch": 0.9806214883614265, + "grad_norm": 0.27169162898514193, + "learning_rate": 9.845381184052649e-07, + "loss": 2.4003, + "step": 8552 + }, + { + "epoch": 0.9807361541107671, + "grad_norm": 0.2969175020424089, + "learning_rate": 9.729250200647965e-07, + "loss": 2.4271, + "step": 8553 + }, + { + "epoch": 0.9808508198601078, + "grad_norm": 0.2341802626887597, + "learning_rate": 9.613807529970097e-07, + "loss": 2.3637, + "step": 8554 + }, + { + "epoch": 0.9809654856094484, + "grad_norm": 0.25508104807854093, + "learning_rate": 9.49905318794242e-07, + "loss": 2.3479, + "step": 8555 + }, + { + "epoch": 0.9810801513587891, + "grad_norm": 0.2554437380150877, + "learning_rate": 9.38498719039227e-07, + "loss": 2.4653, + "step": 8556 + }, + { + "epoch": 0.9811948171081298, + "grad_norm": 0.26662389628647565, + "learning_rate": 9.271609553053728e-07, + "loss": 2.5141, + "step": 8557 + }, + { + "epoch": 0.9813094828574704, + "grad_norm": 0.24288649240674007, + "learning_rate": 9.158920291564842e-07, + "loss": 2.3692, + "step": 8558 + }, + { + "epoch": 0.9814241486068112, + "grad_norm": 0.26412325947355103, + "learning_rate": 9.04691942146818e-07, + "loss": 2.2659, + "step": 8559 + }, + { + "epoch": 0.9815388143561519, + "grad_norm": 0.28103900686852595, + "learning_rate": 8.935606958213049e-07, + "loss": 2.3534, + "step": 8560 + }, + { + "epoch": 0.9816534801054925, + "grad_norm": 0.25661453042230875, + "learning_rate": 8.824982917152724e-07, + "loss": 2.5129, + "step": 8561 + }, + { + "epoch": 0.9817681458548332, + "grad_norm": 0.26762345224717815, + "learning_rate": 8.715047313545554e-07, + "loss": 2.3802, + "step": 8562 + }, + { + "epoch": 0.9818828116041738, + "grad_norm": 0.256755232078889, + "learning_rate": 8.605800162554967e-07, + "loss": 2.2949, + "step": 8563 + }, + { + "epoch": 0.9819974773535145, + "grad_norm": 0.26337890713236367, + "learning_rate": 8.497241479249462e-07, + "loss": 2.5167, + "step": 8564 + }, + { + "epoch": 0.9821121431028552, + "grad_norm": 0.23160137054670188, + "learning_rate": 8.389371278603175e-07, + "loss": 2.2524, + "step": 8565 + }, + { + "epoch": 0.9822268088521958, + "grad_norm": 0.26657365958268897, + "learning_rate": 8.282189575494203e-07, + "loss": 2.4371, + "step": 8566 + }, + { + "epoch": 0.9823414746015365, + "grad_norm": 0.2952456788827243, + "learning_rate": 8.175696384706277e-07, + "loss": 2.4809, + "step": 8567 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.26076524901979437, + "learning_rate": 8.069891720928202e-07, + "loss": 2.328, + "step": 8568 + }, + { + "epoch": 0.9825708061002179, + "grad_norm": 0.24788892534550078, + "learning_rate": 7.964775598753859e-07, + "loss": 2.4206, + "step": 8569 + }, + { + "epoch": 0.9826854718495586, + "grad_norm": 0.2653858524494065, + "learning_rate": 7.860348032682207e-07, + "loss": 2.2839, + "step": 8570 + }, + { + "epoch": 0.9828001375988992, + "grad_norm": 0.2609458188004867, + "learning_rate": 7.756609037116169e-07, + "loss": 2.4553, + "step": 8571 + }, + { + "epoch": 0.9829148033482399, + "grad_norm": 0.2715620262028699, + "learning_rate": 7.653558626365409e-07, + "loss": 2.3281, + "step": 8572 + }, + { + "epoch": 0.9830294690975806, + "grad_norm": 0.2744616849169987, + "learning_rate": 7.551196814643002e-07, + "loss": 2.3304, + "step": 8573 + }, + { + "epoch": 0.9831441348469212, + "grad_norm": 0.27664841180895827, + "learning_rate": 7.449523616068765e-07, + "loss": 2.5822, + "step": 8574 + }, + { + "epoch": 0.9832588005962619, + "grad_norm": 0.2510220552396542, + "learning_rate": 7.348539044665925e-07, + "loss": 2.3962, + "step": 8575 + }, + { + "epoch": 0.9833734663456025, + "grad_norm": 0.25770550998226555, + "learning_rate": 7.24824311436334e-07, + "loss": 2.5001, + "step": 8576 + }, + { + "epoch": 0.9834881320949432, + "grad_norm": 0.23522144334916062, + "learning_rate": 7.148635838994943e-07, + "loss": 2.4103, + "step": 8577 + }, + { + "epoch": 0.983602797844284, + "grad_norm": 0.26492834233872287, + "learning_rate": 7.049717232300302e-07, + "loss": 2.3792, + "step": 8578 + }, + { + "epoch": 0.9837174635936246, + "grad_norm": 0.2841040357611516, + "learning_rate": 6.951487307922944e-07, + "loss": 2.39, + "step": 8579 + }, + { + "epoch": 0.9838321293429653, + "grad_norm": 0.24951431511582048, + "learning_rate": 6.853946079411477e-07, + "loss": 2.2993, + "step": 8580 + }, + { + "epoch": 0.983946795092306, + "grad_norm": 0.2440385410536707, + "learning_rate": 6.75709356022014e-07, + "loss": 2.4304, + "step": 8581 + }, + { + "epoch": 0.9840614608416466, + "grad_norm": 0.3032387375584678, + "learning_rate": 6.660929763707691e-07, + "loss": 2.4766, + "step": 8582 + }, + { + "epoch": 0.9841761265909873, + "grad_norm": 0.26000275894989894, + "learning_rate": 6.565454703138518e-07, + "loss": 2.3079, + "step": 8583 + }, + { + "epoch": 0.9842907923403279, + "grad_norm": 0.23190025457840022, + "learning_rate": 6.470668391681533e-07, + "loss": 2.5004, + "step": 8584 + }, + { + "epoch": 0.9844054580896686, + "grad_norm": 0.24162034169815647, + "learning_rate": 6.376570842410168e-07, + "loss": 2.4408, + "step": 8585 + }, + { + "epoch": 0.9845201238390093, + "grad_norm": 0.2627554606261515, + "learning_rate": 6.283162068304038e-07, + "loss": 2.4052, + "step": 8586 + }, + { + "epoch": 0.9846347895883499, + "grad_norm": 0.253002779773242, + "learning_rate": 6.190442082246172e-07, + "loss": 2.495, + "step": 8587 + }, + { + "epoch": 0.9847494553376906, + "grad_norm": 0.2829690156804403, + "learning_rate": 6.098410897026896e-07, + "loss": 2.4628, + "step": 8588 + }, + { + "epoch": 0.9848641210870313, + "grad_norm": 0.24585862143049741, + "learning_rate": 6.007068525338833e-07, + "loss": 2.3591, + "step": 8589 + }, + { + "epoch": 0.984978786836372, + "grad_norm": 0.2636374727683315, + "learning_rate": 5.916414979781903e-07, + "loss": 2.6421, + "step": 8590 + }, + { + "epoch": 0.9850934525857127, + "grad_norm": 0.2696762905044979, + "learning_rate": 5.82645027285944e-07, + "loss": 2.4102, + "step": 8591 + }, + { + "epoch": 0.9852081183350533, + "grad_norm": 0.25301280819631894, + "learning_rate": 5.737174416980961e-07, + "loss": 2.455, + "step": 8592 + }, + { + "epoch": 0.985322784084394, + "grad_norm": 0.2762356668214205, + "learning_rate": 5.648587424459395e-07, + "loss": 2.4909, + "step": 8593 + }, + { + "epoch": 0.9854374498337347, + "grad_norm": 0.26095966828504896, + "learning_rate": 5.560689307514411e-07, + "loss": 2.418, + "step": 8594 + }, + { + "epoch": 0.9855521155830753, + "grad_norm": 0.2528275987167809, + "learning_rate": 5.473480078269644e-07, + "loss": 2.226, + "step": 8595 + }, + { + "epoch": 0.985666781332416, + "grad_norm": 0.2520518829423772, + "learning_rate": 5.386959748754361e-07, + "loss": 2.3396, + "step": 8596 + }, + { + "epoch": 0.9857814470817566, + "grad_norm": 0.22798840958610034, + "learning_rate": 5.30112833090124e-07, + "loss": 2.2825, + "step": 8597 + }, + { + "epoch": 0.9858961128310973, + "grad_norm": 0.2474641745540536, + "learning_rate": 5.215985836550252e-07, + "loss": 2.3857, + "step": 8598 + }, + { + "epoch": 0.9860107785804381, + "grad_norm": 0.2507477113255739, + "learning_rate": 5.131532277444784e-07, + "loss": 2.3427, + "step": 8599 + }, + { + "epoch": 0.9861254443297787, + "grad_norm": 0.24766441039804943, + "learning_rate": 5.047767665233849e-07, + "loss": 2.4202, + "step": 8600 + }, + { + "epoch": 0.9862401100791194, + "grad_norm": 0.24855984984404453, + "learning_rate": 4.964692011470428e-07, + "loss": 2.3779, + "step": 8601 + }, + { + "epoch": 0.98635477582846, + "grad_norm": 0.2644803728861775, + "learning_rate": 4.882305327614244e-07, + "loss": 2.3043, + "step": 8602 + }, + { + "epoch": 0.9864694415778007, + "grad_norm": 0.23700830234761502, + "learning_rate": 4.800607625027876e-07, + "loss": 2.2955, + "step": 8603 + }, + { + "epoch": 0.9865841073271414, + "grad_norm": 0.26165516163496383, + "learning_rate": 4.719598914980638e-07, + "loss": 2.3221, + "step": 8604 + }, + { + "epoch": 0.986698773076482, + "grad_norm": 0.2676096701582563, + "learning_rate": 4.6392792086463744e-07, + "loss": 2.5827, + "step": 8605 + }, + { + "epoch": 0.9868134388258227, + "grad_norm": 0.25229729564013986, + "learning_rate": 4.559648517102888e-07, + "loss": 2.4649, + "step": 8606 + }, + { + "epoch": 0.9869281045751634, + "grad_norm": 0.262937464184219, + "learning_rate": 4.4807068513341707e-07, + "loss": 2.2522, + "step": 8607 + }, + { + "epoch": 0.987042770324504, + "grad_norm": 0.2588535218203286, + "learning_rate": 4.4024542222287354e-07, + "loss": 2.367, + "step": 8608 + }, + { + "epoch": 0.9871574360738448, + "grad_norm": 0.26780235224776666, + "learning_rate": 4.32489064058017e-07, + "loss": 2.34, + "step": 8609 + }, + { + "epoch": 0.9872721018231854, + "grad_norm": 0.25159497229173994, + "learning_rate": 4.248016117086584e-07, + "loss": 2.3852, + "step": 8610 + }, + { + "epoch": 0.9873867675725261, + "grad_norm": 0.24894319767426784, + "learning_rate": 4.1718306623517164e-07, + "loss": 2.3663, + "step": 8611 + }, + { + "epoch": 0.9875014333218668, + "grad_norm": 0.2704067145258275, + "learning_rate": 4.0963342868832744e-07, + "loss": 2.4953, + "step": 8612 + }, + { + "epoch": 0.9876160990712074, + "grad_norm": 0.26337325958042057, + "learning_rate": 4.021527001095149e-07, + "loss": 2.347, + "step": 8613 + }, + { + "epoch": 0.9877307648205481, + "grad_norm": 0.24638098946302542, + "learning_rate": 3.947408815305198e-07, + "loss": 2.3986, + "step": 8614 + }, + { + "epoch": 0.9878454305698888, + "grad_norm": 0.2403370226109531, + "learning_rate": 3.8739797397369105e-07, + "loss": 2.2409, + "step": 8615 + }, + { + "epoch": 0.9879600963192294, + "grad_norm": 0.25069772670311363, + "learning_rate": 3.801239784518296e-07, + "loss": 2.2775, + "step": 8616 + }, + { + "epoch": 0.9880747620685701, + "grad_norm": 0.2385637216993145, + "learning_rate": 3.729188959682439e-07, + "loss": 2.3719, + "step": 8617 + }, + { + "epoch": 0.9881894278179107, + "grad_norm": 0.2559934146276328, + "learning_rate": 3.657827275167502e-07, + "loss": 2.4245, + "step": 8618 + }, + { + "epoch": 0.9883040935672515, + "grad_norm": 0.26140054239578825, + "learning_rate": 3.587154740816168e-07, + "loss": 2.2906, + "step": 8619 + }, + { + "epoch": 0.9884187593165922, + "grad_norm": 0.2443434791368734, + "learning_rate": 3.517171366376748e-07, + "loss": 2.4153, + "step": 8620 + }, + { + "epoch": 0.9885334250659328, + "grad_norm": 0.25900973421790924, + "learning_rate": 3.4478771615015224e-07, + "loss": 2.4326, + "step": 8621 + }, + { + "epoch": 0.9886480908152735, + "grad_norm": 0.2794240255798061, + "learning_rate": 3.379272135748956e-07, + "loss": 2.3971, + "step": 8622 + }, + { + "epoch": 0.9887627565646141, + "grad_norm": 0.25236031326502967, + "learning_rate": 3.3113562985814804e-07, + "loss": 2.3574, + "step": 8623 + }, + { + "epoch": 0.9888774223139548, + "grad_norm": 0.25063342782528103, + "learning_rate": 3.2441296593666014e-07, + "loss": 2.4527, + "step": 8624 + }, + { + "epoch": 0.9889920880632955, + "grad_norm": 0.261683573217417, + "learning_rate": 3.1775922273774573e-07, + "loss": 2.385, + "step": 8625 + }, + { + "epoch": 0.9891067538126361, + "grad_norm": 0.27361532978457964, + "learning_rate": 3.1117440117917063e-07, + "loss": 2.2607, + "step": 8626 + }, + { + "epoch": 0.9892214195619768, + "grad_norm": 0.2527023328580243, + "learning_rate": 3.046585021690973e-07, + "loss": 2.3632, + "step": 8627 + }, + { + "epoch": 0.9893360853113176, + "grad_norm": 0.2492932072753548, + "learning_rate": 2.982115266063623e-07, + "loss": 2.2511, + "step": 8628 + }, + { + "epoch": 0.9894507510606582, + "grad_norm": 0.2802536918308958, + "learning_rate": 2.9183347538014324e-07, + "loss": 2.4247, + "step": 8629 + }, + { + "epoch": 0.9895654168099989, + "grad_norm": 0.2907507553788078, + "learning_rate": 2.855243493701809e-07, + "loss": 2.2605, + "step": 8630 + }, + { + "epoch": 0.9896800825593395, + "grad_norm": 0.25848255535047177, + "learning_rate": 2.7928414944672355e-07, + "loss": 2.2358, + "step": 8631 + }, + { + "epoch": 0.9897947483086802, + "grad_norm": 0.24152951604750336, + "learning_rate": 2.731128764704716e-07, + "loss": 2.3455, + "step": 8632 + }, + { + "epoch": 0.9899094140580209, + "grad_norm": 0.25856205415955, + "learning_rate": 2.670105312926885e-07, + "loss": 2.3936, + "step": 8633 + }, + { + "epoch": 0.9900240798073615, + "grad_norm": 0.26757653497520606, + "learning_rate": 2.6097711475497885e-07, + "loss": 2.2827, + "step": 8634 + }, + { + "epoch": 0.9901387455567022, + "grad_norm": 0.25914451400940713, + "learning_rate": 2.5501262768956593e-07, + "loss": 2.5759, + "step": 8635 + }, + { + "epoch": 0.9902534113060428, + "grad_norm": 0.24312766035895478, + "learning_rate": 2.4911707091918036e-07, + "loss": 2.4949, + "step": 8636 + }, + { + "epoch": 0.9903680770553835, + "grad_norm": 0.26370911699327604, + "learning_rate": 2.4329044525694954e-07, + "loss": 2.3438, + "step": 8637 + }, + { + "epoch": 0.9904827428047243, + "grad_norm": 0.25296513702543827, + "learning_rate": 2.3753275150661947e-07, + "loss": 2.4272, + "step": 8638 + }, + { + "epoch": 0.9905974085540649, + "grad_norm": 0.24066438685155278, + "learning_rate": 2.318439904622771e-07, + "loss": 2.3957, + "step": 8639 + }, + { + "epoch": 0.9907120743034056, + "grad_norm": 0.2884418538132336, + "learning_rate": 2.2622416290857262e-07, + "loss": 2.3981, + "step": 8640 + }, + { + "epoch": 0.9908267400527463, + "grad_norm": 0.2627610370136801, + "learning_rate": 2.2067326962071922e-07, + "loss": 2.4432, + "step": 8641 + }, + { + "epoch": 0.9909414058020869, + "grad_norm": 0.28302352401365827, + "learning_rate": 2.151913113643822e-07, + "loss": 2.4919, + "step": 8642 + }, + { + "epoch": 0.9910560715514276, + "grad_norm": 0.27097779205217254, + "learning_rate": 2.0977828889556794e-07, + "loss": 2.4745, + "step": 8643 + }, + { + "epoch": 0.9911707373007682, + "grad_norm": 0.26667546134556286, + "learning_rate": 2.0443420296101246e-07, + "loss": 2.4686, + "step": 8644 + }, + { + "epoch": 0.9912854030501089, + "grad_norm": 0.2913833970875544, + "learning_rate": 1.991590542977373e-07, + "loss": 2.3921, + "step": 8645 + }, + { + "epoch": 0.9914000687994496, + "grad_norm": 0.24231146255736993, + "learning_rate": 1.9395284363343813e-07, + "loss": 2.4104, + "step": 8646 + }, + { + "epoch": 0.9915147345487902, + "grad_norm": 0.2438636295979367, + "learning_rate": 1.8881557168620722e-07, + "loss": 2.4818, + "step": 8647 + }, + { + "epoch": 0.991629400298131, + "grad_norm": 0.25132533516851535, + "learning_rate": 1.837472391645334e-07, + "loss": 2.269, + "step": 8648 + }, + { + "epoch": 0.9917440660474717, + "grad_norm": 0.24548136728404096, + "learning_rate": 1.787478467675796e-07, + "loss": 2.2914, + "step": 8649 + }, + { + "epoch": 0.9918587317968123, + "grad_norm": 0.30056787591766043, + "learning_rate": 1.7381739518496087e-07, + "loss": 2.3555, + "step": 8650 + }, + { + "epoch": 0.991973397546153, + "grad_norm": 0.2996134119455222, + "learning_rate": 1.6895588509663328e-07, + "loss": 2.6143, + "step": 8651 + }, + { + "epoch": 0.9920880632954936, + "grad_norm": 0.26386854756364286, + "learning_rate": 1.6416331717317157e-07, + "loss": 2.333, + "step": 8652 + }, + { + "epoch": 0.9922027290448343, + "grad_norm": 0.28182992367003884, + "learning_rate": 1.5943969207565801e-07, + "loss": 2.5012, + "step": 8653 + }, + { + "epoch": 0.992317394794175, + "grad_norm": 0.2614427511089186, + "learning_rate": 1.54785010455627e-07, + "loss": 2.4487, + "step": 8654 + }, + { + "epoch": 0.9924320605435156, + "grad_norm": 0.2871849118856723, + "learning_rate": 1.5019927295506497e-07, + "loss": 2.408, + "step": 8655 + }, + { + "epoch": 0.9925467262928563, + "grad_norm": 0.28388353540419736, + "learning_rate": 1.4568248020652152e-07, + "loss": 2.3525, + "step": 8656 + }, + { + "epoch": 0.9926613920421969, + "grad_norm": 0.2640457456346779, + "learning_rate": 1.4123463283299832e-07, + "loss": 2.4838, + "step": 8657 + }, + { + "epoch": 0.9927760577915377, + "grad_norm": 0.29740165737984015, + "learning_rate": 1.3685573144800456e-07, + "loss": 2.4034, + "step": 8658 + }, + { + "epoch": 0.9928907235408784, + "grad_norm": 0.27163374215204966, + "learning_rate": 1.325457766554461e-07, + "loss": 2.4704, + "step": 8659 + }, + { + "epoch": 0.993005389290219, + "grad_norm": 0.26471152740242737, + "learning_rate": 1.2830476904990286e-07, + "loss": 2.3582, + "step": 8660 + }, + { + "epoch": 0.9931200550395597, + "grad_norm": 0.2503953927919408, + "learning_rate": 1.241327092162403e-07, + "loss": 2.3669, + "step": 8661 + }, + { + "epoch": 0.9932347207889004, + "grad_norm": 0.26780079575386767, + "learning_rate": 1.200295977300536e-07, + "loss": 2.4478, + "step": 8662 + }, + { + "epoch": 0.993349386538241, + "grad_norm": 0.26285216971670683, + "learning_rate": 1.1599543515711241e-07, + "loss": 2.3395, + "step": 8663 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.2588272966672831, + "learning_rate": 1.1203022205402702e-07, + "loss": 2.2397, + "step": 8664 + }, + { + "epoch": 0.9935787180369223, + "grad_norm": 0.2564340223156835, + "learning_rate": 1.0813395896758227e-07, + "loss": 2.5017, + "step": 8665 + }, + { + "epoch": 0.993693383786263, + "grad_norm": 0.23745272228770195, + "learning_rate": 1.0430664643529264e-07, + "loss": 2.325, + "step": 8666 + }, + { + "epoch": 0.9938080495356038, + "grad_norm": 0.27010028185794954, + "learning_rate": 1.0054828498501367e-07, + "loss": 2.4829, + "step": 8667 + }, + { + "epoch": 0.9939227152849444, + "grad_norm": 0.2824664547110627, + "learning_rate": 9.68588751351085e-08, + "loss": 2.4335, + "step": 8668 + }, + { + "epoch": 0.9940373810342851, + "grad_norm": 0.2551464862481459, + "learning_rate": 9.323841739455885e-08, + "loss": 2.5204, + "step": 8669 + }, + { + "epoch": 0.9941520467836257, + "grad_norm": 0.29775161777572506, + "learning_rate": 8.968691226263204e-08, + "loss": 2.4096, + "step": 8670 + }, + { + "epoch": 0.9942667125329664, + "grad_norm": 0.2651379129855896, + "learning_rate": 8.620436022926947e-08, + "loss": 2.4061, + "step": 8671 + }, + { + "epoch": 0.9943813782823071, + "grad_norm": 0.23904300107657003, + "learning_rate": 8.279076177475364e-08, + "loss": 2.374, + "step": 8672 + }, + { + "epoch": 0.9944960440316477, + "grad_norm": 0.24277519274369436, + "learning_rate": 7.944611736998564e-08, + "loss": 2.351, + "step": 8673 + }, + { + "epoch": 0.9946107097809884, + "grad_norm": 0.2468743565022368, + "learning_rate": 7.617042747631864e-08, + "loss": 2.3021, + "step": 8674 + }, + { + "epoch": 0.9947253755303291, + "grad_norm": 0.2607820769266603, + "learning_rate": 7.296369254544688e-08, + "loss": 2.1844, + "step": 8675 + }, + { + "epoch": 0.9948400412796697, + "grad_norm": 0.26964384342108455, + "learning_rate": 6.982591301979424e-08, + "loss": 2.4855, + "step": 8676 + }, + { + "epoch": 0.9949547070290105, + "grad_norm": 0.2703086387292498, + "learning_rate": 6.675708933212565e-08, + "loss": 2.2926, + "step": 8677 + }, + { + "epoch": 0.9950693727783511, + "grad_norm": 0.27776889607863414, + "learning_rate": 6.375722190571364e-08, + "loss": 2.3183, + "step": 8678 + }, + { + "epoch": 0.9951840385276918, + "grad_norm": 0.2560109574942953, + "learning_rate": 6.082631115439385e-08, + "loss": 2.4115, + "step": 8679 + }, + { + "epoch": 0.9952987042770325, + "grad_norm": 0.2674455608940206, + "learning_rate": 5.796435748228746e-08, + "loss": 2.2582, + "step": 8680 + }, + { + "epoch": 0.9954133700263731, + "grad_norm": 0.24423833523924252, + "learning_rate": 5.517136128430078e-08, + "loss": 2.3668, + "step": 8681 + }, + { + "epoch": 0.9955280357757138, + "grad_norm": 0.2628472406338573, + "learning_rate": 5.24473229455702e-08, + "loss": 2.4378, + "step": 8682 + }, + { + "epoch": 0.9956427015250545, + "grad_norm": 0.26159456291626043, + "learning_rate": 4.97922428419062e-08, + "loss": 2.4248, + "step": 8683 + }, + { + "epoch": 0.9957573672743951, + "grad_norm": 0.2576237397475355, + "learning_rate": 4.720612133946034e-08, + "loss": 2.4214, + "step": 8684 + }, + { + "epoch": 0.9958720330237358, + "grad_norm": 0.22990891398550256, + "learning_rate": 4.4688958794947274e-08, + "loss": 2.2679, + "step": 8685 + }, + { + "epoch": 0.9959866987730764, + "grad_norm": 0.2439124544729441, + "learning_rate": 4.224075555558926e-08, + "loss": 2.3323, + "step": 8686 + }, + { + "epoch": 0.9961013645224172, + "grad_norm": 0.2450153319712526, + "learning_rate": 3.986151195906063e-08, + "loss": 2.3078, + "step": 8687 + }, + { + "epoch": 0.9962160302717579, + "grad_norm": 0.25413039722979747, + "learning_rate": 3.7551228333543294e-08, + "loss": 2.3556, + "step": 8688 + }, + { + "epoch": 0.9963306960210985, + "grad_norm": 0.22087335675688888, + "learning_rate": 3.530990499761577e-08, + "loss": 2.2377, + "step": 8689 + }, + { + "epoch": 0.9964453617704392, + "grad_norm": 0.24757528557754177, + "learning_rate": 3.313754226058619e-08, + "loss": 2.343, + "step": 8690 + }, + { + "epoch": 0.9965600275197798, + "grad_norm": 0.2831624394783042, + "learning_rate": 3.10341404219372e-08, + "loss": 2.3523, + "step": 8691 + }, + { + "epoch": 0.9966746932691205, + "grad_norm": 0.25768590833675686, + "learning_rate": 2.8999699771881105e-08, + "loss": 2.3252, + "step": 8692 + }, + { + "epoch": 0.9967893590184612, + "grad_norm": 0.2653641590796179, + "learning_rate": 2.703422059097127e-08, + "loss": 2.3874, + "step": 8693 + }, + { + "epoch": 0.9969040247678018, + "grad_norm": 0.23592517689817794, + "learning_rate": 2.5137703150379666e-08, + "loss": 2.317, + "step": 8694 + }, + { + "epoch": 0.9970186905171425, + "grad_norm": 0.2614730575100257, + "learning_rate": 2.3310147711619324e-08, + "loss": 2.3447, + "step": 8695 + }, + { + "epoch": 0.9971333562664833, + "grad_norm": 0.26783196452850944, + "learning_rate": 2.1551554526821892e-08, + "loss": 2.3634, + "step": 8696 + }, + { + "epoch": 0.9972480220158239, + "grad_norm": 0.2526019031264884, + "learning_rate": 1.9861923838515596e-08, + "loss": 2.3153, + "step": 8697 + }, + { + "epoch": 0.9973626877651646, + "grad_norm": 0.25562213729370675, + "learning_rate": 1.824125587979175e-08, + "loss": 2.4103, + "step": 8698 + }, + { + "epoch": 0.9974773535145052, + "grad_norm": 0.25315660282809743, + "learning_rate": 1.668955087413826e-08, + "loss": 2.5363, + "step": 8699 + }, + { + "epoch": 0.9975920192638459, + "grad_norm": 0.2565312874958023, + "learning_rate": 1.5206809035606116e-08, + "loss": 2.5074, + "step": 8700 + }, + { + "epoch": 0.9977066850131866, + "grad_norm": 0.29156274121267944, + "learning_rate": 1.3793030568753917e-08, + "loss": 2.3541, + "step": 8701 + }, + { + "epoch": 0.9978213507625272, + "grad_norm": 0.2529971326244546, + "learning_rate": 1.2448215668536822e-08, + "loss": 2.4232, + "step": 8702 + }, + { + "epoch": 0.9979360165118679, + "grad_norm": 0.2562649865938568, + "learning_rate": 1.11723645204731e-08, + "loss": 2.3617, + "step": 8703 + }, + { + "epoch": 0.9980506822612085, + "grad_norm": 0.25383915701070436, + "learning_rate": 9.965477300477588e-09, + "loss": 2.4644, + "step": 8704 + }, + { + "epoch": 0.9981653480105492, + "grad_norm": 0.2468425391742744, + "learning_rate": 8.827554175139252e-09, + "loss": 2.2745, + "step": 8705 + }, + { + "epoch": 0.99828001375989, + "grad_norm": 0.26347020622133427, + "learning_rate": 7.758595301277094e-09, + "loss": 2.4839, + "step": 8706 + }, + { + "epoch": 0.9983946795092306, + "grad_norm": 0.268117646474416, + "learning_rate": 6.7586008264397534e-09, + "loss": 2.6396, + "step": 8707 + }, + { + "epoch": 0.9985093452585713, + "grad_norm": 0.23972507325541234, + "learning_rate": 5.82757088846142e-09, + "loss": 2.3252, + "step": 8708 + }, + { + "epoch": 0.998624011007912, + "grad_norm": 0.25997406905727055, + "learning_rate": 4.965505615850408e-09, + "loss": 2.3622, + "step": 8709 + }, + { + "epoch": 0.9987386767572526, + "grad_norm": 0.22394255195605028, + "learning_rate": 4.172405127511603e-09, + "loss": 2.2471, + "step": 8710 + }, + { + "epoch": 0.9988533425065933, + "grad_norm": 0.2808201473396869, + "learning_rate": 3.448269532746462e-09, + "loss": 2.2729, + "step": 8711 + }, + { + "epoch": 0.9989680082559339, + "grad_norm": 0.2795613612146032, + "learning_rate": 2.793098931530569e-09, + "loss": 2.4667, + "step": 8712 + }, + { + "epoch": 0.9990826740052746, + "grad_norm": 0.25612538088162873, + "learning_rate": 2.2068934141805664e-09, + "loss": 2.295, + "step": 8713 + }, + { + "epoch": 0.9991973397546153, + "grad_norm": 0.2704012435212936, + "learning_rate": 1.6896530615206907e-09, + "loss": 2.3392, + "step": 8714 + }, + { + "epoch": 0.9993120055039559, + "grad_norm": 0.24133529993686634, + "learning_rate": 1.2413779449937934e-09, + "loss": 2.3006, + "step": 8715 + }, + { + "epoch": 0.9994266712532967, + "grad_norm": 0.26350406306133656, + "learning_rate": 8.620681263282748e-10, + "loss": 2.4566, + "step": 8716 + }, + { + "epoch": 0.9995413370026374, + "grad_norm": 0.23644930682027115, + "learning_rate": 5.517236579266616e-10, + "loss": 2.5879, + "step": 8717 + }, + { + "epoch": 0.999656002751978, + "grad_norm": 0.2874345114734641, + "learning_rate": 3.1034458258805134e-10, + "loss": 2.4806, + "step": 8718 + }, + { + "epoch": 0.9997706685013187, + "grad_norm": 0.2356292832141907, + "learning_rate": 1.3793093350811247e-10, + "loss": 2.2896, + "step": 8719 + }, + { + "epoch": 0.9998853342506593, + "grad_norm": 0.25027326326702304, + "learning_rate": 3.4482734556640086e-11, + "loss": 2.5145, + "step": 8720 + }, + { + "epoch": 1.0, + "grad_norm": 0.25850277707761193, + "learning_rate": 0.0, + "loss": 2.4187, + "step": 8721 + }, + { + "epoch": 1.0, + "step": 8721, + "total_flos": 291840174110720.0, + "train_loss": 2.5324699844792957, + "train_runtime": 37426.6704, + "train_samples_per_second": 14.913, + "train_steps_per_second": 0.233 + } + ], + "logging_steps": 1.0, + "max_steps": 8721, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 291840174110720.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}