{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0788133053640996, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005993407252022775, "grad_norm": 0.8397009372711182, "learning_rate": 5.991611743559017e-07, "loss": 0.0366, "step": 100 }, { "epoch": 0.01198681450404555, "grad_norm": 0.7360026240348816, "learning_rate": 1.1983223487118035e-06, "loss": 0.0143, "step": 200 }, { "epoch": 0.017980221756068324, "grad_norm": 0.26396483182907104, "learning_rate": 1.7974835230677055e-06, "loss": 0.0091, "step": 300 }, { "epoch": 0.0239736290080911, "grad_norm": 0.08779273182153702, "learning_rate": 2.396644697423607e-06, "loss": 0.0059, "step": 400 }, { "epoch": 0.029967036260113874, "grad_norm": 0.5255675911903381, "learning_rate": 2.995805871779509e-06, "loss": 0.0059, "step": 500 }, { "epoch": 0.03596044351213665, "grad_norm": 0.2772226929664612, "learning_rate": 3.594967046135411e-06, "loss": 0.005, "step": 600 }, { "epoch": 0.041953850764159424, "grad_norm": 0.29560720920562744, "learning_rate": 4.194128220491313e-06, "loss": 0.0041, "step": 700 }, { "epoch": 0.0479472580161822, "grad_norm": 0.4243590235710144, "learning_rate": 4.793289394847214e-06, "loss": 0.0038, "step": 800 }, { "epoch": 0.05394066526820498, "grad_norm": 0.12234604358673096, "learning_rate": 5.392450569203116e-06, "loss": 0.0033, "step": 900 }, { "epoch": 0.05993407252022775, "grad_norm": 0.17332005500793457, "learning_rate": 5.991611743559018e-06, "loss": 0.0045, "step": 1000 }, { "epoch": 0.06592747977225052, "grad_norm": 0.017084548249840736, "learning_rate": 6.59077291791492e-06, "loss": 0.0031, "step": 1100 }, { "epoch": 0.0719208870242733, "grad_norm": 0.04909040033817291, "learning_rate": 7.189934092270822e-06, "loss": 0.0034, "step": 1200 }, { "epoch": 0.07791429427629608, "grad_norm": 0.03835730627179146, "learning_rate": 7.789095266626723e-06, "loss": 0.0028, "step": 1300 }, { "epoch": 0.08390770152831885, "grad_norm": 0.04889771714806557, "learning_rate": 8.388256440982625e-06, "loss": 0.0028, "step": 1400 }, { "epoch": 0.08990110878034162, "grad_norm": 0.1031421571969986, "learning_rate": 8.987417615338527e-06, "loss": 0.003, "step": 1500 }, { "epoch": 0.0958945160323644, "grad_norm": 0.11215908825397491, "learning_rate": 9.586578789694428e-06, "loss": 0.0027, "step": 1600 }, { "epoch": 0.10188792328438717, "grad_norm": 0.1708650439977646, "learning_rate": 9.99022112867102e-06, "loss": 0.0025, "step": 1700 }, { "epoch": 0.10788133053640996, "grad_norm": 0.01850762963294983, "learning_rate": 9.958676382448504e-06, "loss": 0.0025, "step": 1800 }, { "epoch": 0.11387473778843273, "grad_norm": 0.10600468516349792, "learning_rate": 9.927131636225988e-06, "loss": 0.0025, "step": 1900 }, { "epoch": 0.1198681450404555, "grad_norm": 0.16077758371829987, "learning_rate": 9.895586890003471e-06, "loss": 0.0027, "step": 2000 }, { "epoch": 0.12586155229247828, "grad_norm": 0.3142828047275543, "learning_rate": 9.864042143780953e-06, "loss": 0.0022, "step": 2100 }, { "epoch": 0.13185495954450105, "grad_norm": 0.18406708538532257, "learning_rate": 9.832497397558437e-06, "loss": 0.0025, "step": 2200 }, { "epoch": 0.13784836679652382, "grad_norm": 0.10599557310342789, "learning_rate": 9.80095265133592e-06, "loss": 0.0027, "step": 2300 }, { "epoch": 0.1438417740485466, "grad_norm": 0.041681960225105286, "learning_rate": 9.769407905113404e-06, "loss": 0.0027, "step": 2400 }, { "epoch": 0.1498351813005694, "grad_norm": 0.26586103439331055, "learning_rate": 9.737863158890888e-06, "loss": 0.0031, "step": 2500 }, { "epoch": 0.15582858855259216, "grad_norm": 0.1568969488143921, "learning_rate": 9.70631841266837e-06, "loss": 0.0025, "step": 2600 }, { "epoch": 0.16182199580461493, "grad_norm": 0.09259970486164093, "learning_rate": 9.674773666445855e-06, "loss": 0.0023, "step": 2700 }, { "epoch": 0.1678154030566377, "grad_norm": 0.03380216658115387, "learning_rate": 9.643228920223337e-06, "loss": 0.0022, "step": 2800 }, { "epoch": 0.17380881030866047, "grad_norm": 0.18946796655654907, "learning_rate": 9.611684174000821e-06, "loss": 0.0025, "step": 2900 }, { "epoch": 0.17980221756068324, "grad_norm": 0.3344770073890686, "learning_rate": 9.580139427778305e-06, "loss": 0.0021, "step": 3000 }, { "epoch": 0.18579562481270603, "grad_norm": 0.04218849539756775, "learning_rate": 9.548594681555787e-06, "loss": 0.0024, "step": 3100 }, { "epoch": 0.1917890320647288, "grad_norm": 0.0481434129178524, "learning_rate": 9.517049935333272e-06, "loss": 0.0027, "step": 3200 }, { "epoch": 0.19778243931675157, "grad_norm": 0.32030656933784485, "learning_rate": 9.485505189110754e-06, "loss": 0.0025, "step": 3300 }, { "epoch": 0.20377584656877434, "grad_norm": 0.19509385526180267, "learning_rate": 9.453960442888238e-06, "loss": 0.0022, "step": 3400 }, { "epoch": 0.2097692538207971, "grad_norm": 0.08745113760232925, "learning_rate": 9.422415696665721e-06, "loss": 0.0026, "step": 3500 }, { "epoch": 0.2157626610728199, "grad_norm": 0.11743105947971344, "learning_rate": 9.390870950443205e-06, "loss": 0.0021, "step": 3600 }, { "epoch": 0.22175606832484268, "grad_norm": 0.1497587114572525, "learning_rate": 9.359326204220689e-06, "loss": 0.0026, "step": 3700 }, { "epoch": 0.22774947557686545, "grad_norm": 0.07227639853954315, "learning_rate": 9.32778145799817e-06, "loss": 0.0024, "step": 3800 }, { "epoch": 0.23374288282888822, "grad_norm": 0.022099023684859276, "learning_rate": 9.296236711775654e-06, "loss": 0.0019, "step": 3900 }, { "epoch": 0.239736290080911, "grad_norm": 0.09603813290596008, "learning_rate": 9.264691965553138e-06, "loss": 0.0019, "step": 4000 }, { "epoch": 0.24572969733293376, "grad_norm": 0.09311718493700027, "learning_rate": 9.233147219330622e-06, "loss": 0.002, "step": 4100 }, { "epoch": 0.25172310458495656, "grad_norm": 0.06892485171556473, "learning_rate": 9.201602473108105e-06, "loss": 0.0022, "step": 4200 }, { "epoch": 0.2577165118369793, "grad_norm": 0.2696809470653534, "learning_rate": 9.170057726885589e-06, "loss": 0.0024, "step": 4300 }, { "epoch": 0.2637099190890021, "grad_norm": 0.12481023371219635, "learning_rate": 9.138512980663071e-06, "loss": 0.0021, "step": 4400 }, { "epoch": 0.2697033263410249, "grad_norm": 0.029085570946335793, "learning_rate": 9.106968234440555e-06, "loss": 0.0025, "step": 4500 }, { "epoch": 0.27569673359304764, "grad_norm": 0.16772325336933136, "learning_rate": 9.075423488218038e-06, "loss": 0.0019, "step": 4600 }, { "epoch": 0.28169014084507044, "grad_norm": 0.25038984417915344, "learning_rate": 9.04387874199552e-06, "loss": 0.0022, "step": 4700 }, { "epoch": 0.2876835480970932, "grad_norm": 0.009772785007953644, "learning_rate": 9.012333995773006e-06, "loss": 0.002, "step": 4800 }, { "epoch": 0.293676955349116, "grad_norm": 0.10010802745819092, "learning_rate": 8.980789249550487e-06, "loss": 0.0021, "step": 4900 }, { "epoch": 0.2996703626011388, "grad_norm": 0.019169898703694344, "learning_rate": 8.949244503327971e-06, "loss": 0.0024, "step": 5000 }, { "epoch": 0.3056637698531615, "grad_norm": 0.039739012718200684, "learning_rate": 8.917699757105455e-06, "loss": 0.0022, "step": 5100 }, { "epoch": 0.3116571771051843, "grad_norm": 0.20961305499076843, "learning_rate": 8.886155010882938e-06, "loss": 0.0021, "step": 5200 }, { "epoch": 0.31765058435720706, "grad_norm": 0.07605484127998352, "learning_rate": 8.854610264660422e-06, "loss": 0.002, "step": 5300 }, { "epoch": 0.32364399160922985, "grad_norm": 0.01589258573949337, "learning_rate": 8.823065518437904e-06, "loss": 0.0022, "step": 5400 }, { "epoch": 0.3296373988612526, "grad_norm": 0.10248999297618866, "learning_rate": 8.791520772215388e-06, "loss": 0.0023, "step": 5500 }, { "epoch": 0.3356308061132754, "grad_norm": 0.09167122095823288, "learning_rate": 8.759976025992871e-06, "loss": 0.002, "step": 5600 }, { "epoch": 0.3416242133652982, "grad_norm": 0.23392055928707123, "learning_rate": 8.728431279770355e-06, "loss": 0.0021, "step": 5700 }, { "epoch": 0.34761762061732093, "grad_norm": 0.040714360773563385, "learning_rate": 8.696886533547839e-06, "loss": 0.0025, "step": 5800 }, { "epoch": 0.35361102786934373, "grad_norm": 0.184820294380188, "learning_rate": 8.665341787325322e-06, "loss": 0.0021, "step": 5900 }, { "epoch": 0.3596044351213665, "grad_norm": 0.04772236570715904, "learning_rate": 8.633797041102804e-06, "loss": 0.0022, "step": 6000 }, { "epoch": 0.36559784237338927, "grad_norm": 0.12407626956701279, "learning_rate": 8.60225229488029e-06, "loss": 0.0018, "step": 6100 }, { "epoch": 0.37159124962541207, "grad_norm": 0.1552393138408661, "learning_rate": 8.570707548657772e-06, "loss": 0.0024, "step": 6200 }, { "epoch": 0.3775846568774348, "grad_norm": 0.005017109680920839, "learning_rate": 8.539162802435255e-06, "loss": 0.0022, "step": 6300 }, { "epoch": 0.3835780641294576, "grad_norm": 0.00316947465762496, "learning_rate": 8.507618056212739e-06, "loss": 0.0021, "step": 6400 }, { "epoch": 0.38957147138148035, "grad_norm": 0.08644753694534302, "learning_rate": 8.476073309990221e-06, "loss": 0.0016, "step": 6500 }, { "epoch": 0.39556487863350315, "grad_norm": 0.23877011239528656, "learning_rate": 8.444528563767705e-06, "loss": 0.0023, "step": 6600 }, { "epoch": 0.40155828588552595, "grad_norm": 0.12397243827581406, "learning_rate": 8.412983817545188e-06, "loss": 0.002, "step": 6700 }, { "epoch": 0.4075516931375487, "grad_norm": 0.08488207310438156, "learning_rate": 8.381439071322672e-06, "loss": 0.002, "step": 6800 }, { "epoch": 0.4135451003895715, "grad_norm": 0.15658150613307953, "learning_rate": 8.349894325100156e-06, "loss": 0.0021, "step": 6900 }, { "epoch": 0.4195385076415942, "grad_norm": 0.09054456651210785, "learning_rate": 8.31834957887764e-06, "loss": 0.0022, "step": 7000 }, { "epoch": 0.425531914893617, "grad_norm": 0.1383715718984604, "learning_rate": 8.286804832655121e-06, "loss": 0.0019, "step": 7100 }, { "epoch": 0.4315253221456398, "grad_norm": 0.23421403765678406, "learning_rate": 8.255260086432605e-06, "loss": 0.0021, "step": 7200 }, { "epoch": 0.43751872939766256, "grad_norm": 0.07612959295511246, "learning_rate": 8.223715340210089e-06, "loss": 0.0018, "step": 7300 }, { "epoch": 0.44351213664968536, "grad_norm": 0.08813223987817764, "learning_rate": 8.192170593987572e-06, "loss": 0.0028, "step": 7400 }, { "epoch": 0.4495055439017081, "grad_norm": 0.11603320389986038, "learning_rate": 8.160625847765056e-06, "loss": 0.0021, "step": 7500 }, { "epoch": 0.4554989511537309, "grad_norm": 0.06462118774652481, "learning_rate": 8.129081101542538e-06, "loss": 0.0021, "step": 7600 }, { "epoch": 0.46149235840575364, "grad_norm": 0.08253411203622818, "learning_rate": 8.097536355320023e-06, "loss": 0.0019, "step": 7700 }, { "epoch": 0.46748576565777644, "grad_norm": 0.017711922526359558, "learning_rate": 8.065991609097505e-06, "loss": 0.0018, "step": 7800 }, { "epoch": 0.47347917290979924, "grad_norm": 0.16423271596431732, "learning_rate": 8.034446862874989e-06, "loss": 0.0021, "step": 7900 }, { "epoch": 0.479472580161822, "grad_norm": 0.17104622721672058, "learning_rate": 8.002902116652473e-06, "loss": 0.0022, "step": 8000 }, { "epoch": 0.4854659874138448, "grad_norm": 0.11236003786325455, "learning_rate": 7.971357370429955e-06, "loss": 0.002, "step": 8100 }, { "epoch": 0.4914593946658675, "grad_norm": 0.004910625517368317, "learning_rate": 7.93981262420744e-06, "loss": 0.0017, "step": 8200 }, { "epoch": 0.4974528019178903, "grad_norm": 0.015166868455708027, "learning_rate": 7.908267877984922e-06, "loss": 0.0016, "step": 8300 }, { "epoch": 0.5034462091699131, "grad_norm": 0.04219336435198784, "learning_rate": 7.876723131762406e-06, "loss": 0.0019, "step": 8400 }, { "epoch": 0.5094396164219359, "grad_norm": 0.08096965402364731, "learning_rate": 7.84517838553989e-06, "loss": 0.002, "step": 8500 }, { "epoch": 0.5154330236739586, "grad_norm": 0.27304044365882874, "learning_rate": 7.813633639317373e-06, "loss": 0.002, "step": 8600 }, { "epoch": 0.5214264309259814, "grad_norm": 0.023843977600336075, "learning_rate": 7.782088893094857e-06, "loss": 0.0021, "step": 8700 }, { "epoch": 0.5274198381780042, "grad_norm": 0.06996838003396988, "learning_rate": 7.750544146872338e-06, "loss": 0.0016, "step": 8800 }, { "epoch": 0.533413245430027, "grad_norm": 0.09238845109939575, "learning_rate": 7.718999400649822e-06, "loss": 0.0017, "step": 8900 }, { "epoch": 0.5394066526820498, "grad_norm": 0.031245483085513115, "learning_rate": 7.687454654427306e-06, "loss": 0.0019, "step": 9000 }, { "epoch": 0.5454000599340725, "grad_norm": 0.02232646569609642, "learning_rate": 7.65590990820479e-06, "loss": 0.0022, "step": 9100 }, { "epoch": 0.5513934671860953, "grad_norm": 0.18810293078422546, "learning_rate": 7.624365161982272e-06, "loss": 0.002, "step": 9200 }, { "epoch": 0.5573868744381181, "grad_norm": 0.04845254495739937, "learning_rate": 7.592820415759756e-06, "loss": 0.0021, "step": 9300 }, { "epoch": 0.5633802816901409, "grad_norm": 0.12073975801467896, "learning_rate": 7.561275669537239e-06, "loss": 0.0021, "step": 9400 }, { "epoch": 0.5693736889421637, "grad_norm": 0.03330647572875023, "learning_rate": 7.529730923314722e-06, "loss": 0.002, "step": 9500 }, { "epoch": 0.5753670961941864, "grad_norm": 0.23292703926563263, "learning_rate": 7.498186177092206e-06, "loss": 0.0018, "step": 9600 }, { "epoch": 0.5813605034462092, "grad_norm": 0.3227817118167877, "learning_rate": 7.466641430869689e-06, "loss": 0.0017, "step": 9700 }, { "epoch": 0.587353910698232, "grad_norm": 0.03530238941311836, "learning_rate": 7.4350966846471726e-06, "loss": 0.0023, "step": 9800 }, { "epoch": 0.5933473179502547, "grad_norm": 0.1631837785243988, "learning_rate": 7.403551938424655e-06, "loss": 0.0019, "step": 9900 }, { "epoch": 0.5993407252022775, "grad_norm": 0.11341429501771927, "learning_rate": 7.37200719220214e-06, "loss": 0.0018, "step": 10000 }, { "epoch": 0.6053341324543002, "grad_norm": 0.19524067640304565, "learning_rate": 7.340462445979623e-06, "loss": 0.0021, "step": 10100 }, { "epoch": 0.611327539706323, "grad_norm": 0.058198366314172745, "learning_rate": 7.308917699757106e-06, "loss": 0.0018, "step": 10200 }, { "epoch": 0.6173209469583458, "grad_norm": 0.02788078971207142, "learning_rate": 7.277372953534589e-06, "loss": 0.0014, "step": 10300 }, { "epoch": 0.6233143542103686, "grad_norm": 0.07168685644865036, "learning_rate": 7.245828207312072e-06, "loss": 0.0017, "step": 10400 }, { "epoch": 0.6293077614623914, "grad_norm": 0.07542666047811508, "learning_rate": 7.2142834610895565e-06, "loss": 0.0022, "step": 10500 }, { "epoch": 0.6353011687144141, "grad_norm": 0.1050957664847374, "learning_rate": 7.182738714867039e-06, "loss": 0.0017, "step": 10600 }, { "epoch": 0.6412945759664369, "grad_norm": 0.02330237440764904, "learning_rate": 7.151193968644523e-06, "loss": 0.002, "step": 10700 }, { "epoch": 0.6472879832184597, "grad_norm": 0.019814783707261086, "learning_rate": 7.119649222422006e-06, "loss": 0.0019, "step": 10800 }, { "epoch": 0.6532813904704825, "grad_norm": 0.041212160140275955, "learning_rate": 7.08810447619949e-06, "loss": 0.0022, "step": 10900 }, { "epoch": 0.6592747977225052, "grad_norm": 0.104148730635643, "learning_rate": 7.056559729976973e-06, "loss": 0.0017, "step": 11000 }, { "epoch": 0.665268204974528, "grad_norm": 0.060578759759664536, "learning_rate": 7.025014983754457e-06, "loss": 0.0019, "step": 11100 }, { "epoch": 0.6712616122265508, "grad_norm": 0.014108662493526936, "learning_rate": 6.99347023753194e-06, "loss": 0.002, "step": 11200 }, { "epoch": 0.6772550194785736, "grad_norm": 0.06860730797052383, "learning_rate": 6.9619254913094224e-06, "loss": 0.0018, "step": 11300 }, { "epoch": 0.6832484267305964, "grad_norm": 0.2818455696105957, "learning_rate": 6.930380745086906e-06, "loss": 0.0015, "step": 11400 }, { "epoch": 0.6892418339826191, "grad_norm": 0.09976188093423843, "learning_rate": 6.89883599886439e-06, "loss": 0.0017, "step": 11500 }, { "epoch": 0.6952352412346419, "grad_norm": 0.04988027364015579, "learning_rate": 6.8672912526418734e-06, "loss": 0.0016, "step": 11600 }, { "epoch": 0.7012286484866647, "grad_norm": 0.061295535415410995, "learning_rate": 6.835746506419356e-06, "loss": 0.0016, "step": 11700 }, { "epoch": 0.7072220557386875, "grad_norm": 0.04820416495203972, "learning_rate": 6.80420176019684e-06, "loss": 0.0017, "step": 11800 }, { "epoch": 0.7132154629907103, "grad_norm": 0.08933009207248688, "learning_rate": 6.772657013974323e-06, "loss": 0.002, "step": 11900 }, { "epoch": 0.719208870242733, "grad_norm": 0.057753268629312515, "learning_rate": 6.7411122677518055e-06, "loss": 0.0018, "step": 12000 }, { "epoch": 0.7252022774947557, "grad_norm": 0.020321357995271683, "learning_rate": 6.70956752152929e-06, "loss": 0.0017, "step": 12100 }, { "epoch": 0.7311956847467785, "grad_norm": 0.258957177400589, "learning_rate": 6.678022775306773e-06, "loss": 0.0019, "step": 12200 }, { "epoch": 0.7371890919988013, "grad_norm": 0.1562880277633667, "learning_rate": 6.6464780290842565e-06, "loss": 0.002, "step": 12300 }, { "epoch": 0.7431824992508241, "grad_norm": 0.0703672245144844, "learning_rate": 6.614933282861739e-06, "loss": 0.0018, "step": 12400 }, { "epoch": 0.7491759065028468, "grad_norm": 0.015919741243124008, "learning_rate": 6.583388536639224e-06, "loss": 0.0018, "step": 12500 }, { "epoch": 0.7551693137548696, "grad_norm": 0.06606917828321457, "learning_rate": 6.551843790416707e-06, "loss": 0.0022, "step": 12600 }, { "epoch": 0.7611627210068924, "grad_norm": 0.1327201873064041, "learning_rate": 6.52029904419419e-06, "loss": 0.002, "step": 12700 }, { "epoch": 0.7671561282589152, "grad_norm": 0.10167068988084793, "learning_rate": 6.488754297971673e-06, "loss": 0.0018, "step": 12800 }, { "epoch": 0.773149535510938, "grad_norm": 0.20014306902885437, "learning_rate": 6.457209551749156e-06, "loss": 0.0019, "step": 12900 }, { "epoch": 0.7791429427629607, "grad_norm": 0.10611408203840256, "learning_rate": 6.4256648055266405e-06, "loss": 0.0016, "step": 13000 }, { "epoch": 0.7851363500149835, "grad_norm": 0.004227208439260721, "learning_rate": 6.394120059304123e-06, "loss": 0.0018, "step": 13100 }, { "epoch": 0.7911297572670063, "grad_norm": 0.04251255840063095, "learning_rate": 6.362575313081607e-06, "loss": 0.0022, "step": 13200 }, { "epoch": 0.7971231645190291, "grad_norm": 0.09611974656581879, "learning_rate": 6.33103056685909e-06, "loss": 0.0019, "step": 13300 }, { "epoch": 0.8031165717710519, "grad_norm": 0.060009848326444626, "learning_rate": 6.299485820636574e-06, "loss": 0.0019, "step": 13400 }, { "epoch": 0.8091099790230746, "grad_norm": 0.027135098353028297, "learning_rate": 6.267941074414057e-06, "loss": 0.0016, "step": 13500 }, { "epoch": 0.8151033862750974, "grad_norm": 0.09115968644618988, "learning_rate": 6.236396328191541e-06, "loss": 0.0017, "step": 13600 }, { "epoch": 0.8210967935271202, "grad_norm": 0.3819001317024231, "learning_rate": 6.204851581969024e-06, "loss": 0.0019, "step": 13700 }, { "epoch": 0.827090200779143, "grad_norm": 0.07268409430980682, "learning_rate": 6.173306835746506e-06, "loss": 0.002, "step": 13800 }, { "epoch": 0.8330836080311658, "grad_norm": 0.1490897685289383, "learning_rate": 6.14176208952399e-06, "loss": 0.0015, "step": 13900 }, { "epoch": 0.8390770152831885, "grad_norm": 0.07468798011541367, "learning_rate": 6.110217343301474e-06, "loss": 0.0017, "step": 14000 }, { "epoch": 0.8450704225352113, "grad_norm": 0.045000866055488586, "learning_rate": 6.078672597078957e-06, "loss": 0.0019, "step": 14100 }, { "epoch": 0.851063829787234, "grad_norm": 0.22245222330093384, "learning_rate": 6.04712785085644e-06, "loss": 0.0015, "step": 14200 }, { "epoch": 0.8570572370392568, "grad_norm": 0.09135129302740097, "learning_rate": 6.015583104633924e-06, "loss": 0.002, "step": 14300 }, { "epoch": 0.8630506442912796, "grad_norm": 0.043701499700546265, "learning_rate": 5.984038358411407e-06, "loss": 0.0017, "step": 14400 }, { "epoch": 0.8690440515433023, "grad_norm": 0.1364869773387909, "learning_rate": 5.9524936121888895e-06, "loss": 0.0019, "step": 14500 }, { "epoch": 0.8750374587953251, "grad_norm": 0.08669265359640121, "learning_rate": 5.920948865966374e-06, "loss": 0.002, "step": 14600 }, { "epoch": 0.8810308660473479, "grad_norm": 0.00844608899205923, "learning_rate": 5.889404119743857e-06, "loss": 0.0016, "step": 14700 }, { "epoch": 0.8870242732993707, "grad_norm": 0.027935262769460678, "learning_rate": 5.8578593735213405e-06, "loss": 0.0018, "step": 14800 }, { "epoch": 0.8930176805513935, "grad_norm": 0.0481196753680706, "learning_rate": 5.826314627298823e-06, "loss": 0.0019, "step": 14900 }, { "epoch": 0.8990110878034162, "grad_norm": 0.021947329863905907, "learning_rate": 5.794769881076308e-06, "loss": 0.0015, "step": 15000 }, { "epoch": 0.905004495055439, "grad_norm": 0.08527759462594986, "learning_rate": 5.763225134853791e-06, "loss": 0.0017, "step": 15100 }, { "epoch": 0.9109979023074618, "grad_norm": 0.021068023517727852, "learning_rate": 5.731680388631274e-06, "loss": 0.0018, "step": 15200 }, { "epoch": 0.9169913095594846, "grad_norm": 0.08113428950309753, "learning_rate": 5.700135642408757e-06, "loss": 0.0017, "step": 15300 }, { "epoch": 0.9229847168115073, "grad_norm": 0.10709325969219208, "learning_rate": 5.66859089618624e-06, "loss": 0.0015, "step": 15400 }, { "epoch": 0.9289781240635301, "grad_norm": 0.08009694516658783, "learning_rate": 5.6370461499637244e-06, "loss": 0.0016, "step": 15500 }, { "epoch": 0.9349715313155529, "grad_norm": 0.03613545373082161, "learning_rate": 5.605501403741207e-06, "loss": 0.0017, "step": 15600 }, { "epoch": 0.9409649385675757, "grad_norm": 0.06710252165794373, "learning_rate": 5.573956657518691e-06, "loss": 0.0018, "step": 15700 }, { "epoch": 0.9469583458195985, "grad_norm": 0.09847810864448547, "learning_rate": 5.542411911296174e-06, "loss": 0.0014, "step": 15800 }, { "epoch": 0.9529517530716212, "grad_norm": 0.011624569073319435, "learning_rate": 5.510867165073658e-06, "loss": 0.0016, "step": 15900 }, { "epoch": 0.958945160323644, "grad_norm": 0.06741365045309067, "learning_rate": 5.479322418851141e-06, "loss": 0.0015, "step": 16000 }, { "epoch": 0.9649385675756668, "grad_norm": 0.021546615287661552, "learning_rate": 5.447777672628625e-06, "loss": 0.0017, "step": 16100 }, { "epoch": 0.9709319748276896, "grad_norm": 0.1303360015153885, "learning_rate": 5.4162329264061075e-06, "loss": 0.0018, "step": 16200 }, { "epoch": 0.9769253820797124, "grad_norm": 0.10070718824863434, "learning_rate": 5.38468818018359e-06, "loss": 0.0018, "step": 16300 }, { "epoch": 0.982918789331735, "grad_norm": 0.08305861055850983, "learning_rate": 5.353143433961074e-06, "loss": 0.0016, "step": 16400 }, { "epoch": 0.9889121965837578, "grad_norm": 0.007656518369913101, "learning_rate": 5.321598687738557e-06, "loss": 0.0017, "step": 16500 }, { "epoch": 0.9949056038357806, "grad_norm": 0.0743492990732193, "learning_rate": 5.290053941516041e-06, "loss": 0.0015, "step": 16600 }, { "epoch": 1.0, "eval_accuracy": 0.8271744263468347, "eval_f1": 0.7498195656860883, "eval_loss": 0.001594877801835537, "eval_precision": 0.6861185445920746, "eval_recall": 0.8271744263468347, "eval_runtime": 1686.0917, "eval_samples_per_second": 8.796, "eval_steps_per_second": 1.1, "step": 16685 }, { "epoch": 1.0008990110878033, "grad_norm": 0.05216585099697113, "learning_rate": 5.258509195293524e-06, "loss": 0.0015, "step": 16700 }, { "epoch": 1.0068924183398262, "grad_norm": 0.12606635689735413, "learning_rate": 5.226964449071008e-06, "loss": 0.0011, "step": 16800 }, { "epoch": 1.012885825591849, "grad_norm": 0.0004606186121236533, "learning_rate": 5.195419702848491e-06, "loss": 0.001, "step": 16900 }, { "epoch": 1.0188792328438718, "grad_norm": 0.000365409447113052, "learning_rate": 5.1638749566259735e-06, "loss": 0.001, "step": 17000 }, { "epoch": 1.0248726400958945, "grad_norm": 0.031485725194215775, "learning_rate": 5.132330210403458e-06, "loss": 0.0012, "step": 17100 }, { "epoch": 1.0308660473479172, "grad_norm": 0.0031660550739616156, "learning_rate": 5.100785464180941e-06, "loss": 0.0011, "step": 17200 }, { "epoch": 1.0368594545999401, "grad_norm": 0.04788443446159363, "learning_rate": 5.0692407179584244e-06, "loss": 0.001, "step": 17300 }, { "epoch": 1.0428528618519628, "grad_norm": 0.07966958731412888, "learning_rate": 5.037695971735907e-06, "loss": 0.001, "step": 17400 }, { "epoch": 1.0488462691039857, "grad_norm": 0.2937103807926178, "learning_rate": 5.006151225513392e-06, "loss": 0.0007, "step": 17500 }, { "epoch": 1.0548396763560084, "grad_norm": 0.0027551730163395405, "learning_rate": 4.974606479290875e-06, "loss": 0.0007, "step": 17600 }, { "epoch": 1.060833083608031, "grad_norm": 0.08430271595716476, "learning_rate": 4.943061733068357e-06, "loss": 0.0008, "step": 17700 }, { "epoch": 1.066826490860054, "grad_norm": 0.24536843597888947, "learning_rate": 4.911516986845841e-06, "loss": 0.0006, "step": 17800 }, { "epoch": 1.0728198981120767, "grad_norm": 0.040876179933547974, "learning_rate": 4.879972240623325e-06, "loss": 0.0009, "step": 17900 }, { "epoch": 1.0788133053640996, "grad_norm": 0.0515579879283905, "learning_rate": 4.848427494400808e-06, "loss": 0.0007, "step": 18000 } ], "logging_steps": 100, "max_steps": 33370, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 7.597573697465206e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }