{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1186113789778207,
"eval_steps": 100,
"global_step": 2900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003857280617164899,
"grad_norm": 2.45951247215271,
"learning_rate": 2.497749774977498e-05,
"loss": 3.6523,
"step": 10
},
{
"epoch": 0.007714561234329798,
"grad_norm": 1.4561721086502075,
"learning_rate": 2.4952495249524954e-05,
"loss": 2.7058,
"step": 20
},
{
"epoch": 0.011571841851494697,
"grad_norm": 1.2352159023284912,
"learning_rate": 2.4927492749274926e-05,
"loss": 2.0281,
"step": 30
},
{
"epoch": 0.015429122468659595,
"grad_norm": 1.2523480653762817,
"learning_rate": 2.4902490249024905e-05,
"loss": 1.5702,
"step": 40
},
{
"epoch": 0.019286403085824494,
"grad_norm": 1.407914400100708,
"learning_rate": 2.4877487748774877e-05,
"loss": 1.177,
"step": 50
},
{
"epoch": 0.023143683702989394,
"grad_norm": 1.1699292659759521,
"learning_rate": 2.4852485248524852e-05,
"loss": 0.9623,
"step": 60
},
{
"epoch": 0.02700096432015429,
"grad_norm": 1.0662754774093628,
"learning_rate": 2.4827482748274828e-05,
"loss": 1.029,
"step": 70
},
{
"epoch": 0.03085824493731919,
"grad_norm": 0.9690182209014893,
"learning_rate": 2.4802480248024803e-05,
"loss": 0.946,
"step": 80
},
{
"epoch": 0.03471552555448409,
"grad_norm": 0.8241429328918457,
"learning_rate": 2.477747774777478e-05,
"loss": 0.8613,
"step": 90
},
{
"epoch": 0.03857280617164899,
"grad_norm": 0.9876273274421692,
"learning_rate": 2.4752475247524754e-05,
"loss": 0.891,
"step": 100
},
{
"epoch": 0.03857280617164899,
"eval_loss": 0.8584423065185547,
"eval_runtime": 94.247,
"eval_samples_per_second": 55.015,
"eval_steps_per_second": 6.886,
"step": 100
},
{
"epoch": 0.04243008678881389,
"grad_norm": 0.8240203261375427,
"learning_rate": 2.472747274727473e-05,
"loss": 0.8924,
"step": 110
},
{
"epoch": 0.04628736740597879,
"grad_norm": 0.7671812176704407,
"learning_rate": 2.47024702470247e-05,
"loss": 0.8446,
"step": 120
},
{
"epoch": 0.05014464802314368,
"grad_norm": 0.9588340520858765,
"learning_rate": 2.467746774677468e-05,
"loss": 0.8491,
"step": 130
},
{
"epoch": 0.05400192864030858,
"grad_norm": 0.9825944304466248,
"learning_rate": 2.4652465246524652e-05,
"loss": 0.8437,
"step": 140
},
{
"epoch": 0.05785920925747348,
"grad_norm": 0.9779114723205566,
"learning_rate": 2.4627462746274628e-05,
"loss": 0.8404,
"step": 150
},
{
"epoch": 0.06171648987463838,
"grad_norm": 0.9949918389320374,
"learning_rate": 2.4602460246024603e-05,
"loss": 0.772,
"step": 160
},
{
"epoch": 0.06557377049180328,
"grad_norm": 0.9132283329963684,
"learning_rate": 2.457745774577458e-05,
"loss": 0.8132,
"step": 170
},
{
"epoch": 0.06943105110896818,
"grad_norm": 0.8586040735244751,
"learning_rate": 2.4552455245524554e-05,
"loss": 0.8365,
"step": 180
},
{
"epoch": 0.07328833172613308,
"grad_norm": 0.78518146276474,
"learning_rate": 2.452745274527453e-05,
"loss": 0.7607,
"step": 190
},
{
"epoch": 0.07714561234329798,
"grad_norm": 1.1228320598602295,
"learning_rate": 2.45024502450245e-05,
"loss": 0.8262,
"step": 200
},
{
"epoch": 0.07714561234329798,
"eval_loss": 0.7683142423629761,
"eval_runtime": 94.3304,
"eval_samples_per_second": 54.966,
"eval_steps_per_second": 6.88,
"step": 200
},
{
"epoch": 0.08100289296046287,
"grad_norm": 1.47947096824646,
"learning_rate": 2.447744774477448e-05,
"loss": 0.7629,
"step": 210
},
{
"epoch": 0.08486017357762778,
"grad_norm": 1.398677110671997,
"learning_rate": 2.4452445244524452e-05,
"loss": 0.7253,
"step": 220
},
{
"epoch": 0.08871745419479267,
"grad_norm": 0.8628906607627869,
"learning_rate": 2.4427442744274428e-05,
"loss": 0.7691,
"step": 230
},
{
"epoch": 0.09257473481195758,
"grad_norm": 0.9008379578590393,
"learning_rate": 2.4402440244024403e-05,
"loss": 0.6461,
"step": 240
},
{
"epoch": 0.09643201542912247,
"grad_norm": 0.6998778581619263,
"learning_rate": 2.437743774377438e-05,
"loss": 0.7174,
"step": 250
},
{
"epoch": 0.10028929604628736,
"grad_norm": 0.863390326499939,
"learning_rate": 2.4352435243524354e-05,
"loss": 0.6757,
"step": 260
},
{
"epoch": 0.10414657666345227,
"grad_norm": 1.0060020685195923,
"learning_rate": 2.432743274327433e-05,
"loss": 0.6933,
"step": 270
},
{
"epoch": 0.10800385728061716,
"grad_norm": 0.8257681727409363,
"learning_rate": 2.4302430243024305e-05,
"loss": 0.7369,
"step": 280
},
{
"epoch": 0.11186113789778207,
"grad_norm": 0.6368749141693115,
"learning_rate": 2.4277427742774277e-05,
"loss": 0.6612,
"step": 290
},
{
"epoch": 0.11571841851494696,
"grad_norm": 0.8179033994674683,
"learning_rate": 2.4252425242524256e-05,
"loss": 0.6744,
"step": 300
},
{
"epoch": 0.11571841851494696,
"eval_loss": 0.689677357673645,
"eval_runtime": 94.4186,
"eval_samples_per_second": 54.915,
"eval_steps_per_second": 6.874,
"step": 300
},
{
"epoch": 0.11957569913211186,
"grad_norm": 0.7856632471084595,
"learning_rate": 2.4227422742274228e-05,
"loss": 0.6407,
"step": 310
},
{
"epoch": 0.12343297974927676,
"grad_norm": 0.788524866104126,
"learning_rate": 2.4202420242024203e-05,
"loss": 0.7115,
"step": 320
},
{
"epoch": 0.12729026036644167,
"grad_norm": 1.0506746768951416,
"learning_rate": 2.417741774177418e-05,
"loss": 0.6825,
"step": 330
},
{
"epoch": 0.13114754098360656,
"grad_norm": 1.2924314737319946,
"learning_rate": 2.4152415241524154e-05,
"loss": 0.6627,
"step": 340
},
{
"epoch": 0.13500482160077146,
"grad_norm": 0.8082237243652344,
"learning_rate": 2.4127412741274126e-05,
"loss": 0.6895,
"step": 350
},
{
"epoch": 0.13886210221793635,
"grad_norm": 0.787610650062561,
"learning_rate": 2.4102410241024105e-05,
"loss": 0.6839,
"step": 360
},
{
"epoch": 0.14271938283510124,
"grad_norm": 0.7939244508743286,
"learning_rate": 2.4077407740774077e-05,
"loss": 0.6533,
"step": 370
},
{
"epoch": 0.14657666345226616,
"grad_norm": 0.7655636668205261,
"learning_rate": 2.4052405240524052e-05,
"loss": 0.6645,
"step": 380
},
{
"epoch": 0.15043394406943106,
"grad_norm": 0.709829568862915,
"learning_rate": 2.402740274027403e-05,
"loss": 0.6792,
"step": 390
},
{
"epoch": 0.15429122468659595,
"grad_norm": 0.7088485956192017,
"learning_rate": 2.4002400240024003e-05,
"loss": 0.6568,
"step": 400
},
{
"epoch": 0.15429122468659595,
"eval_loss": 0.677769124507904,
"eval_runtime": 94.3721,
"eval_samples_per_second": 54.942,
"eval_steps_per_second": 6.877,
"step": 400
},
{
"epoch": 0.15814850530376084,
"grad_norm": 0.739398717880249,
"learning_rate": 2.397739773977398e-05,
"loss": 0.6802,
"step": 410
},
{
"epoch": 0.16200578592092574,
"grad_norm": 0.7921575307846069,
"learning_rate": 2.3952395239523954e-05,
"loss": 0.6598,
"step": 420
},
{
"epoch": 0.16586306653809066,
"grad_norm": 0.9333528280258179,
"learning_rate": 2.392739273927393e-05,
"loss": 0.6543,
"step": 430
},
{
"epoch": 0.16972034715525555,
"grad_norm": 0.906482994556427,
"learning_rate": 2.39023902390239e-05,
"loss": 0.692,
"step": 440
},
{
"epoch": 0.17357762777242045,
"grad_norm": 0.8562319278717041,
"learning_rate": 2.387738773877388e-05,
"loss": 0.6963,
"step": 450
},
{
"epoch": 0.17743490838958534,
"grad_norm": 0.8864608407020569,
"learning_rate": 2.3852385238523852e-05,
"loss": 0.6672,
"step": 460
},
{
"epoch": 0.18129218900675023,
"grad_norm": 0.7445130944252014,
"learning_rate": 2.3827382738273828e-05,
"loss": 0.6052,
"step": 470
},
{
"epoch": 0.18514946962391515,
"grad_norm": 0.751557469367981,
"learning_rate": 2.3802380238023803e-05,
"loss": 0.6301,
"step": 480
},
{
"epoch": 0.18900675024108005,
"grad_norm": 0.6981202960014343,
"learning_rate": 2.377737773777378e-05,
"loss": 0.6423,
"step": 490
},
{
"epoch": 0.19286403085824494,
"grad_norm": 0.9979777336120605,
"learning_rate": 2.3752375237523754e-05,
"loss": 0.6075,
"step": 500
},
{
"epoch": 0.19286403085824494,
"eval_loss": 0.6642400622367859,
"eval_runtime": 94.3518,
"eval_samples_per_second": 54.954,
"eval_steps_per_second": 6.879,
"step": 500
},
{
"epoch": 0.19672131147540983,
"grad_norm": 0.7130064368247986,
"learning_rate": 2.372737273727373e-05,
"loss": 0.6054,
"step": 510
},
{
"epoch": 0.20057859209257473,
"grad_norm": 0.7771989703178406,
"learning_rate": 2.37023702370237e-05,
"loss": 0.6621,
"step": 520
},
{
"epoch": 0.20443587270973965,
"grad_norm": 0.8572603464126587,
"learning_rate": 2.3677367736773677e-05,
"loss": 0.6563,
"step": 530
},
{
"epoch": 0.20829315332690454,
"grad_norm": 0.8305298686027527,
"learning_rate": 2.3652365236523656e-05,
"loss": 0.6658,
"step": 540
},
{
"epoch": 0.21215043394406943,
"grad_norm": 0.8520190119743347,
"learning_rate": 2.3627362736273628e-05,
"loss": 0.638,
"step": 550
},
{
"epoch": 0.21600771456123433,
"grad_norm": 0.9404274225234985,
"learning_rate": 2.3602360236023603e-05,
"loss": 0.6067,
"step": 560
},
{
"epoch": 0.21986499517839922,
"grad_norm": 0.8018991351127625,
"learning_rate": 2.357735773577358e-05,
"loss": 0.6385,
"step": 570
},
{
"epoch": 0.22372227579556414,
"grad_norm": 0.8628789186477661,
"learning_rate": 2.3552355235523554e-05,
"loss": 0.6554,
"step": 580
},
{
"epoch": 0.22757955641272903,
"grad_norm": 0.8279526829719543,
"learning_rate": 2.352735273527353e-05,
"loss": 0.6114,
"step": 590
},
{
"epoch": 0.23143683702989393,
"grad_norm": 0.8158825635910034,
"learning_rate": 2.3502350235023505e-05,
"loss": 0.6149,
"step": 600
},
{
"epoch": 0.23143683702989393,
"eval_loss": 0.6521801352500916,
"eval_runtime": 94.3458,
"eval_samples_per_second": 54.957,
"eval_steps_per_second": 6.879,
"step": 600
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.8334428071975708,
"learning_rate": 2.3477347734773477e-05,
"loss": 0.6769,
"step": 610
},
{
"epoch": 0.2391513982642237,
"grad_norm": 0.9083623886108398,
"learning_rate": 2.3452345234523456e-05,
"loss": 0.6331,
"step": 620
},
{
"epoch": 0.24300867888138863,
"grad_norm": 1.199766993522644,
"learning_rate": 2.3427342734273428e-05,
"loss": 0.6967,
"step": 630
},
{
"epoch": 0.24686595949855353,
"grad_norm": 1.2198294401168823,
"learning_rate": 2.3402340234023403e-05,
"loss": 0.6618,
"step": 640
},
{
"epoch": 0.2507232401157184,
"grad_norm": 0.8489105701446533,
"learning_rate": 2.337733773377338e-05,
"loss": 0.6242,
"step": 650
},
{
"epoch": 0.25458052073288334,
"grad_norm": 1.0652421712875366,
"learning_rate": 2.3352335233523354e-05,
"loss": 0.6192,
"step": 660
},
{
"epoch": 0.25843780135004824,
"grad_norm": 0.7928668856620789,
"learning_rate": 2.3327332733273326e-05,
"loss": 0.5706,
"step": 670
},
{
"epoch": 0.26229508196721313,
"grad_norm": 0.8512901663780212,
"learning_rate": 2.3302330233023305e-05,
"loss": 0.6457,
"step": 680
},
{
"epoch": 0.266152362584378,
"grad_norm": 0.8443427085876465,
"learning_rate": 2.327732773277328e-05,
"loss": 0.601,
"step": 690
},
{
"epoch": 0.2700096432015429,
"grad_norm": 0.8724773526191711,
"learning_rate": 2.3252325232523252e-05,
"loss": 0.6476,
"step": 700
},
{
"epoch": 0.2700096432015429,
"eval_loss": 0.6422102451324463,
"eval_runtime": 94.4282,
"eval_samples_per_second": 54.909,
"eval_steps_per_second": 6.873,
"step": 700
},
{
"epoch": 0.2738669238187078,
"grad_norm": 0.8733763098716736,
"learning_rate": 2.322732273227323e-05,
"loss": 0.6523,
"step": 710
},
{
"epoch": 0.2777242044358727,
"grad_norm": 0.8932089805603027,
"learning_rate": 2.3202320232023203e-05,
"loss": 0.6305,
"step": 720
},
{
"epoch": 0.2815814850530376,
"grad_norm": 0.9854605197906494,
"learning_rate": 2.317731773177318e-05,
"loss": 0.6358,
"step": 730
},
{
"epoch": 0.2854387656702025,
"grad_norm": 0.8158785700798035,
"learning_rate": 2.3152315231523154e-05,
"loss": 0.6027,
"step": 740
},
{
"epoch": 0.2892960462873674,
"grad_norm": 0.9273302555084229,
"learning_rate": 2.312731273127313e-05,
"loss": 0.6431,
"step": 750
},
{
"epoch": 0.29315332690453233,
"grad_norm": 0.9094042181968689,
"learning_rate": 2.31023102310231e-05,
"loss": 0.5767,
"step": 760
},
{
"epoch": 0.2970106075216972,
"grad_norm": 0.8175253868103027,
"learning_rate": 2.307730773077308e-05,
"loss": 0.6174,
"step": 770
},
{
"epoch": 0.3008678881388621,
"grad_norm": 0.8517961502075195,
"learning_rate": 2.3052305230523052e-05,
"loss": 0.6183,
"step": 780
},
{
"epoch": 0.304725168756027,
"grad_norm": 0.8863179087638855,
"learning_rate": 2.3027302730273028e-05,
"loss": 0.5849,
"step": 790
},
{
"epoch": 0.3085824493731919,
"grad_norm": 0.9195278882980347,
"learning_rate": 2.3002300230023003e-05,
"loss": 0.6016,
"step": 800
},
{
"epoch": 0.3085824493731919,
"eval_loss": 0.6320463418960571,
"eval_runtime": 94.3592,
"eval_samples_per_second": 54.95,
"eval_steps_per_second": 6.878,
"step": 800
},
{
"epoch": 0.3124397299903568,
"grad_norm": 0.9424280524253845,
"learning_rate": 2.297729772977298e-05,
"loss": 0.5961,
"step": 810
},
{
"epoch": 0.3162970106075217,
"grad_norm": 1.031079888343811,
"learning_rate": 2.295229522952295e-05,
"loss": 0.6357,
"step": 820
},
{
"epoch": 0.3201542912246866,
"grad_norm": 0.9320313334465027,
"learning_rate": 2.292729272927293e-05,
"loss": 0.6228,
"step": 830
},
{
"epoch": 0.3240115718418515,
"grad_norm": 0.9292299747467041,
"learning_rate": 2.2902290229022905e-05,
"loss": 0.6504,
"step": 840
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.8377825021743774,
"learning_rate": 2.2877287728772877e-05,
"loss": 0.5952,
"step": 850
},
{
"epoch": 0.3317261330761813,
"grad_norm": 0.8555241227149963,
"learning_rate": 2.2852285228522856e-05,
"loss": 0.5852,
"step": 860
},
{
"epoch": 0.3355834136933462,
"grad_norm": 1.0691065788269043,
"learning_rate": 2.2827282728272828e-05,
"loss": 0.5806,
"step": 870
},
{
"epoch": 0.3394406943105111,
"grad_norm": 1.0052144527435303,
"learning_rate": 2.2802280228022803e-05,
"loss": 0.6592,
"step": 880
},
{
"epoch": 0.343297974927676,
"grad_norm": 1.000553011894226,
"learning_rate": 2.277727772777278e-05,
"loss": 0.6347,
"step": 890
},
{
"epoch": 0.3471552555448409,
"grad_norm": 1.13107430934906,
"learning_rate": 2.2752275227522754e-05,
"loss": 0.5989,
"step": 900
},
{
"epoch": 0.3471552555448409,
"eval_loss": 0.6231358647346497,
"eval_runtime": 94.3809,
"eval_samples_per_second": 54.937,
"eval_steps_per_second": 6.876,
"step": 900
},
{
"epoch": 0.3510125361620058,
"grad_norm": 1.0130326747894287,
"learning_rate": 2.272727272727273e-05,
"loss": 0.6227,
"step": 910
},
{
"epoch": 0.3548698167791707,
"grad_norm": 1.0335384607315063,
"learning_rate": 2.2702270227022705e-05,
"loss": 0.5277,
"step": 920
},
{
"epoch": 0.35872709739633557,
"grad_norm": 0.9162185788154602,
"learning_rate": 2.2677267726772677e-05,
"loss": 0.5633,
"step": 930
},
{
"epoch": 0.36258437801350046,
"grad_norm": 0.9492796063423157,
"learning_rate": 2.2652265226522652e-05,
"loss": 0.6536,
"step": 940
},
{
"epoch": 0.36644165863066536,
"grad_norm": 1.0065137147903442,
"learning_rate": 2.2627262726272628e-05,
"loss": 0.6622,
"step": 950
},
{
"epoch": 0.3702989392478303,
"grad_norm": 0.917143702507019,
"learning_rate": 2.2602260226022603e-05,
"loss": 0.6391,
"step": 960
},
{
"epoch": 0.3741562198649952,
"grad_norm": 0.9580853581428528,
"learning_rate": 2.257725772577258e-05,
"loss": 0.6354,
"step": 970
},
{
"epoch": 0.3780135004821601,
"grad_norm": 1.1998488903045654,
"learning_rate": 2.2552255225522554e-05,
"loss": 0.5885,
"step": 980
},
{
"epoch": 0.381870781099325,
"grad_norm": 0.9667923450469971,
"learning_rate": 2.252725272527253e-05,
"loss": 0.6199,
"step": 990
},
{
"epoch": 0.3857280617164899,
"grad_norm": 0.9675014019012451,
"learning_rate": 2.2502250225022505e-05,
"loss": 0.5522,
"step": 1000
},
{
"epoch": 0.3857280617164899,
"eval_loss": 0.6154375672340393,
"eval_runtime": 94.3972,
"eval_samples_per_second": 54.927,
"eval_steps_per_second": 6.875,
"step": 1000
},
{
"epoch": 0.38958534233365477,
"grad_norm": 1.035885214805603,
"learning_rate": 2.247724772477248e-05,
"loss": 0.5868,
"step": 1010
},
{
"epoch": 0.39344262295081966,
"grad_norm": 1.1226266622543335,
"learning_rate": 2.2452245224522452e-05,
"loss": 0.5787,
"step": 1020
},
{
"epoch": 0.39729990356798456,
"grad_norm": 1.0908483266830444,
"learning_rate": 2.2427242724272428e-05,
"loss": 0.6161,
"step": 1030
},
{
"epoch": 0.40115718418514945,
"grad_norm": 0.9660767316818237,
"learning_rate": 2.2402240224022403e-05,
"loss": 0.6277,
"step": 1040
},
{
"epoch": 0.40501446480231434,
"grad_norm": 0.9711313843727112,
"learning_rate": 2.237723772377238e-05,
"loss": 0.6062,
"step": 1050
},
{
"epoch": 0.4088717454194793,
"grad_norm": 0.9374969601631165,
"learning_rate": 2.2352235223522354e-05,
"loss": 0.6299,
"step": 1060
},
{
"epoch": 0.4127290260366442,
"grad_norm": 1.0570039749145508,
"learning_rate": 2.232723272327233e-05,
"loss": 0.5965,
"step": 1070
},
{
"epoch": 0.4165863066538091,
"grad_norm": 1.0144932270050049,
"learning_rate": 2.23022302230223e-05,
"loss": 0.5479,
"step": 1080
},
{
"epoch": 0.420443587270974,
"grad_norm": 0.9654034972190857,
"learning_rate": 2.227722772277228e-05,
"loss": 0.5768,
"step": 1090
},
{
"epoch": 0.42430086788813887,
"grad_norm": 0.9580025672912598,
"learning_rate": 2.2252225222522252e-05,
"loss": 0.6023,
"step": 1100
},
{
"epoch": 0.42430086788813887,
"eval_loss": 0.6073106527328491,
"eval_runtime": 94.4156,
"eval_samples_per_second": 54.917,
"eval_steps_per_second": 6.874,
"step": 1100
},
{
"epoch": 0.42815814850530376,
"grad_norm": 1.0227288007736206,
"learning_rate": 2.2227222722272228e-05,
"loss": 0.5824,
"step": 1110
},
{
"epoch": 0.43201542912246865,
"grad_norm": 0.977800726890564,
"learning_rate": 2.2202220222022203e-05,
"loss": 0.5629,
"step": 1120
},
{
"epoch": 0.43587270973963355,
"grad_norm": 0.9433587789535522,
"learning_rate": 2.217721772177218e-05,
"loss": 0.5774,
"step": 1130
},
{
"epoch": 0.43972999035679844,
"grad_norm": 1.0534788370132446,
"learning_rate": 2.2152215221522154e-05,
"loss": 0.6191,
"step": 1140
},
{
"epoch": 0.44358727097396333,
"grad_norm": 0.9741374850273132,
"learning_rate": 2.212721272127213e-05,
"loss": 0.5989,
"step": 1150
},
{
"epoch": 0.4474445515911283,
"grad_norm": 1.1215403079986572,
"learning_rate": 2.2102210221022105e-05,
"loss": 0.6547,
"step": 1160
},
{
"epoch": 0.4513018322082932,
"grad_norm": 1.1161948442459106,
"learning_rate": 2.2077207720772077e-05,
"loss": 0.5999,
"step": 1170
},
{
"epoch": 0.45515911282545807,
"grad_norm": 1.1462429761886597,
"learning_rate": 2.2052205220522055e-05,
"loss": 0.6458,
"step": 1180
},
{
"epoch": 0.45901639344262296,
"grad_norm": 1.0904706716537476,
"learning_rate": 2.2027202720272027e-05,
"loss": 0.5839,
"step": 1190
},
{
"epoch": 0.46287367405978785,
"grad_norm": 1.0991252660751343,
"learning_rate": 2.2002200220022003e-05,
"loss": 0.5403,
"step": 1200
},
{
"epoch": 0.46287367405978785,
"eval_loss": 0.6001272797584534,
"eval_runtime": 94.5589,
"eval_samples_per_second": 54.834,
"eval_steps_per_second": 6.863,
"step": 1200
},
{
"epoch": 0.46673095467695275,
"grad_norm": 1.221454381942749,
"learning_rate": 2.197719771977198e-05,
"loss": 0.6155,
"step": 1210
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.0147477388381958,
"learning_rate": 2.1952195219521954e-05,
"loss": 0.62,
"step": 1220
},
{
"epoch": 0.47444551591128253,
"grad_norm": 1.0702507495880127,
"learning_rate": 2.1927192719271926e-05,
"loss": 0.5605,
"step": 1230
},
{
"epoch": 0.4783027965284474,
"grad_norm": 1.295518398284912,
"learning_rate": 2.1902190219021905e-05,
"loss": 0.5065,
"step": 1240
},
{
"epoch": 0.4821600771456123,
"grad_norm": 1.1323541402816772,
"learning_rate": 2.1877187718771877e-05,
"loss": 0.5726,
"step": 1250
},
{
"epoch": 0.48601735776277727,
"grad_norm": 0.9562482833862305,
"learning_rate": 2.1852185218521852e-05,
"loss": 0.5683,
"step": 1260
},
{
"epoch": 0.48987463837994216,
"grad_norm": 1.129547119140625,
"learning_rate": 2.1827182718271827e-05,
"loss": 0.5732,
"step": 1270
},
{
"epoch": 0.49373191899710706,
"grad_norm": 1.0175765752792358,
"learning_rate": 2.1802180218021803e-05,
"loss": 0.5251,
"step": 1280
},
{
"epoch": 0.49758919961427195,
"grad_norm": 1.1538267135620117,
"learning_rate": 2.177717771777178e-05,
"loss": 0.5798,
"step": 1290
},
{
"epoch": 0.5014464802314368,
"grad_norm": 1.1203854084014893,
"learning_rate": 2.1752175217521754e-05,
"loss": 0.535,
"step": 1300
},
{
"epoch": 0.5014464802314368,
"eval_loss": 0.593771755695343,
"eval_runtime": 94.4579,
"eval_samples_per_second": 54.892,
"eval_steps_per_second": 6.871,
"step": 1300
},
{
"epoch": 0.5053037608486017,
"grad_norm": 1.158937692642212,
"learning_rate": 2.172717271727173e-05,
"loss": 0.5667,
"step": 1310
},
{
"epoch": 0.5091610414657667,
"grad_norm": 1.1078110933303833,
"learning_rate": 2.17021702170217e-05,
"loss": 0.6097,
"step": 1320
},
{
"epoch": 0.5130183220829315,
"grad_norm": 1.1934500932693481,
"learning_rate": 2.167716771677168e-05,
"loss": 0.5478,
"step": 1330
},
{
"epoch": 0.5168756027000965,
"grad_norm": 1.048662781715393,
"learning_rate": 2.1652165216521652e-05,
"loss": 0.5753,
"step": 1340
},
{
"epoch": 0.5207328833172613,
"grad_norm": 1.0503116846084595,
"learning_rate": 2.1627162716271627e-05,
"loss": 0.5762,
"step": 1350
},
{
"epoch": 0.5245901639344263,
"grad_norm": 1.1861109733581543,
"learning_rate": 2.1602160216021603e-05,
"loss": 0.5808,
"step": 1360
},
{
"epoch": 0.5284474445515911,
"grad_norm": 1.178539752960205,
"learning_rate": 2.1577157715771578e-05,
"loss": 0.5584,
"step": 1370
},
{
"epoch": 0.532304725168756,
"grad_norm": 1.0662671327590942,
"learning_rate": 2.1552155215521554e-05,
"loss": 0.5535,
"step": 1380
},
{
"epoch": 0.5361620057859209,
"grad_norm": 1.1202431917190552,
"learning_rate": 2.152715271527153e-05,
"loss": 0.5555,
"step": 1390
},
{
"epoch": 0.5400192864030858,
"grad_norm": 1.1992982625961304,
"learning_rate": 2.15021502150215e-05,
"loss": 0.5712,
"step": 1400
},
{
"epoch": 0.5400192864030858,
"eval_loss": 0.5875272750854492,
"eval_runtime": 94.4312,
"eval_samples_per_second": 54.908,
"eval_steps_per_second": 6.873,
"step": 1400
},
{
"epoch": 0.5438765670202508,
"grad_norm": 1.1259962320327759,
"learning_rate": 2.147714771477148e-05,
"loss": 0.5676,
"step": 1410
},
{
"epoch": 0.5477338476374156,
"grad_norm": 1.0652165412902832,
"learning_rate": 2.1452145214521452e-05,
"loss": 0.5551,
"step": 1420
},
{
"epoch": 0.5515911282545806,
"grad_norm": 1.1056393384933472,
"learning_rate": 2.1427142714271427e-05,
"loss": 0.508,
"step": 1430
},
{
"epoch": 0.5554484088717454,
"grad_norm": 1.1506450176239014,
"learning_rate": 2.1402140214021403e-05,
"loss": 0.582,
"step": 1440
},
{
"epoch": 0.5593056894889104,
"grad_norm": 1.4107190370559692,
"learning_rate": 2.1377137713771378e-05,
"loss": 0.5821,
"step": 1450
},
{
"epoch": 0.5631629701060752,
"grad_norm": 1.2830005884170532,
"learning_rate": 2.1352135213521354e-05,
"loss": 0.5451,
"step": 1460
},
{
"epoch": 0.5670202507232401,
"grad_norm": 1.1122502088546753,
"learning_rate": 2.132713271327133e-05,
"loss": 0.5905,
"step": 1470
},
{
"epoch": 0.570877531340405,
"grad_norm": 1.1104683876037598,
"learning_rate": 2.1302130213021305e-05,
"loss": 0.6189,
"step": 1480
},
{
"epoch": 0.5747348119575699,
"grad_norm": 1.2569029331207275,
"learning_rate": 2.1277127712771277e-05,
"loss": 0.5717,
"step": 1490
},
{
"epoch": 0.5785920925747348,
"grad_norm": 1.1278156042099,
"learning_rate": 2.1252125212521255e-05,
"loss": 0.5686,
"step": 1500
},
{
"epoch": 0.5785920925747348,
"eval_loss": 0.5797137022018433,
"eval_runtime": 94.4199,
"eval_samples_per_second": 54.914,
"eval_steps_per_second": 6.874,
"step": 1500
},
{
"epoch": 0.5824493731918997,
"grad_norm": 1.075393795967102,
"learning_rate": 2.1227122712271227e-05,
"loss": 0.5849,
"step": 1510
},
{
"epoch": 0.5863066538090647,
"grad_norm": 1.2325960397720337,
"learning_rate": 2.1202120212021203e-05,
"loss": 0.5706,
"step": 1520
},
{
"epoch": 0.5901639344262295,
"grad_norm": 1.1058759689331055,
"learning_rate": 2.1177117711771178e-05,
"loss": 0.5706,
"step": 1530
},
{
"epoch": 0.5940212150433944,
"grad_norm": 1.1634057760238647,
"learning_rate": 2.1152115211521154e-05,
"loss": 0.5518,
"step": 1540
},
{
"epoch": 0.5978784956605593,
"grad_norm": 1.0119497776031494,
"learning_rate": 2.1127112711271126e-05,
"loss": 0.5104,
"step": 1550
},
{
"epoch": 0.6017357762777242,
"grad_norm": 1.2648943662643433,
"learning_rate": 2.1102110211021104e-05,
"loss": 0.5261,
"step": 1560
},
{
"epoch": 0.6055930568948891,
"grad_norm": 1.2454555034637451,
"learning_rate": 2.1077107710771077e-05,
"loss": 0.5633,
"step": 1570
},
{
"epoch": 0.609450337512054,
"grad_norm": 1.1793566942214966,
"learning_rate": 2.1052105210521052e-05,
"loss": 0.535,
"step": 1580
},
{
"epoch": 0.6133076181292189,
"grad_norm": 1.5229750871658325,
"learning_rate": 2.102710271027103e-05,
"loss": 0.5559,
"step": 1590
},
{
"epoch": 0.6171648987463838,
"grad_norm": 1.2203059196472168,
"learning_rate": 2.1002100210021003e-05,
"loss": 0.5315,
"step": 1600
},
{
"epoch": 0.6171648987463838,
"eval_loss": 0.5732572078704834,
"eval_runtime": 94.4141,
"eval_samples_per_second": 54.918,
"eval_steps_per_second": 6.874,
"step": 1600
},
{
"epoch": 0.6210221793635486,
"grad_norm": 1.4130253791809082,
"learning_rate": 2.0977097709770978e-05,
"loss": 0.5521,
"step": 1610
},
{
"epoch": 0.6248794599807136,
"grad_norm": 1.2830981016159058,
"learning_rate": 2.0952095209520954e-05,
"loss": 0.5432,
"step": 1620
},
{
"epoch": 0.6287367405978785,
"grad_norm": 1.1956433057785034,
"learning_rate": 2.092709270927093e-05,
"loss": 0.5746,
"step": 1630
},
{
"epoch": 0.6325940212150434,
"grad_norm": 1.5104076862335205,
"learning_rate": 2.09020902090209e-05,
"loss": 0.5916,
"step": 1640
},
{
"epoch": 0.6364513018322083,
"grad_norm": 1.2112847566604614,
"learning_rate": 2.087708770877088e-05,
"loss": 0.5322,
"step": 1650
},
{
"epoch": 0.6403085824493732,
"grad_norm": 1.1859279870986938,
"learning_rate": 2.0852085208520852e-05,
"loss": 0.599,
"step": 1660
},
{
"epoch": 0.6441658630665381,
"grad_norm": 1.348300576210022,
"learning_rate": 2.0827082708270827e-05,
"loss": 0.605,
"step": 1670
},
{
"epoch": 0.648023143683703,
"grad_norm": 1.3982155323028564,
"learning_rate": 2.0802080208020803e-05,
"loss": 0.5367,
"step": 1680
},
{
"epoch": 0.6518804243008679,
"grad_norm": 1.2189476490020752,
"learning_rate": 2.0777077707770778e-05,
"loss": 0.5855,
"step": 1690
},
{
"epoch": 0.6557377049180327,
"grad_norm": 1.3908072710037231,
"learning_rate": 2.0752075207520754e-05,
"loss": 0.5876,
"step": 1700
},
{
"epoch": 0.6557377049180327,
"eval_loss": 0.5670270919799805,
"eval_runtime": 94.4393,
"eval_samples_per_second": 54.903,
"eval_steps_per_second": 6.872,
"step": 1700
},
{
"epoch": 0.6595949855351977,
"grad_norm": 1.150038480758667,
"learning_rate": 2.072707270727073e-05,
"loss": 0.5151,
"step": 1710
},
{
"epoch": 0.6634522661523626,
"grad_norm": 1.2351560592651367,
"learning_rate": 2.07020702070207e-05,
"loss": 0.5171,
"step": 1720
},
{
"epoch": 0.6673095467695275,
"grad_norm": 1.2720533609390259,
"learning_rate": 2.0677067706770676e-05,
"loss": 0.5526,
"step": 1730
},
{
"epoch": 0.6711668273866924,
"grad_norm": 1.2330290079116821,
"learning_rate": 2.0652065206520655e-05,
"loss": 0.5516,
"step": 1740
},
{
"epoch": 0.6750241080038573,
"grad_norm": 1.319873571395874,
"learning_rate": 2.0627062706270627e-05,
"loss": 0.5512,
"step": 1750
},
{
"epoch": 0.6788813886210222,
"grad_norm": 1.663527250289917,
"learning_rate": 2.0602060206020603e-05,
"loss": 0.556,
"step": 1760
},
{
"epoch": 0.682738669238187,
"grad_norm": 1.2730813026428223,
"learning_rate": 2.0577057705770578e-05,
"loss": 0.5362,
"step": 1770
},
{
"epoch": 0.686595949855352,
"grad_norm": 1.2985719442367554,
"learning_rate": 2.0552055205520554e-05,
"loss": 0.6448,
"step": 1780
},
{
"epoch": 0.6904532304725168,
"grad_norm": 1.384941577911377,
"learning_rate": 2.052705270527053e-05,
"loss": 0.5767,
"step": 1790
},
{
"epoch": 0.6943105110896818,
"grad_norm": 1.2721012830734253,
"learning_rate": 2.0502050205020504e-05,
"loss": 0.6248,
"step": 1800
},
{
"epoch": 0.6943105110896818,
"eval_loss": 0.560900092124939,
"eval_runtime": 94.4846,
"eval_samples_per_second": 54.877,
"eval_steps_per_second": 6.869,
"step": 1800
},
{
"epoch": 0.6981677917068466,
"grad_norm": 1.3880654573440552,
"learning_rate": 2.0477047704770476e-05,
"loss": 0.5389,
"step": 1810
},
{
"epoch": 0.7020250723240116,
"grad_norm": 1.2518627643585205,
"learning_rate": 2.0452045204520455e-05,
"loss": 0.566,
"step": 1820
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.4524362087249756,
"learning_rate": 2.0427042704270427e-05,
"loss": 0.5105,
"step": 1830
},
{
"epoch": 0.7097396335583414,
"grad_norm": 1.2816158533096313,
"learning_rate": 2.0402040204020403e-05,
"loss": 0.5308,
"step": 1840
},
{
"epoch": 0.7135969141755063,
"grad_norm": 1.286135196685791,
"learning_rate": 2.0377037703770378e-05,
"loss": 0.5273,
"step": 1850
},
{
"epoch": 0.7174541947926711,
"grad_norm": 1.4501844644546509,
"learning_rate": 2.0352035203520354e-05,
"loss": 0.5622,
"step": 1860
},
{
"epoch": 0.7213114754098361,
"grad_norm": 1.3340784311294556,
"learning_rate": 2.0327032703270326e-05,
"loss": 0.6137,
"step": 1870
},
{
"epoch": 0.7251687560270009,
"grad_norm": 1.439643383026123,
"learning_rate": 2.0302030203020304e-05,
"loss": 0.5846,
"step": 1880
},
{
"epoch": 0.7290260366441659,
"grad_norm": 1.2474430799484253,
"learning_rate": 2.027702770277028e-05,
"loss": 0.5519,
"step": 1890
},
{
"epoch": 0.7328833172613307,
"grad_norm": 1.0996040105819702,
"learning_rate": 2.0252025202520252e-05,
"loss": 0.5481,
"step": 1900
},
{
"epoch": 0.7328833172613307,
"eval_loss": 0.5548669695854187,
"eval_runtime": 94.5252,
"eval_samples_per_second": 54.853,
"eval_steps_per_second": 6.866,
"step": 1900
},
{
"epoch": 0.7367405978784957,
"grad_norm": 1.5467498302459717,
"learning_rate": 2.022702270227023e-05,
"loss": 0.546,
"step": 1910
},
{
"epoch": 0.7405978784956606,
"grad_norm": 1.4486864805221558,
"learning_rate": 2.0202020202020203e-05,
"loss": 0.5239,
"step": 1920
},
{
"epoch": 0.7444551591128254,
"grad_norm": 1.3535338640213013,
"learning_rate": 2.0177017701770178e-05,
"loss": 0.5733,
"step": 1930
},
{
"epoch": 0.7483124397299904,
"grad_norm": 1.4148615598678589,
"learning_rate": 2.0152015201520154e-05,
"loss": 0.5177,
"step": 1940
},
{
"epoch": 0.7521697203471552,
"grad_norm": 1.5134552717208862,
"learning_rate": 2.012701270127013e-05,
"loss": 0.5643,
"step": 1950
},
{
"epoch": 0.7560270009643202,
"grad_norm": 1.5626767873764038,
"learning_rate": 2.01020102010201e-05,
"loss": 0.5317,
"step": 1960
},
{
"epoch": 0.759884281581485,
"grad_norm": 1.3729217052459717,
"learning_rate": 2.007700770077008e-05,
"loss": 0.5859,
"step": 1970
},
{
"epoch": 0.76374156219865,
"grad_norm": 1.5823298692703247,
"learning_rate": 2.0052005200520052e-05,
"loss": 0.517,
"step": 1980
},
{
"epoch": 0.7675988428158148,
"grad_norm": 1.4126390218734741,
"learning_rate": 2.0027002700270027e-05,
"loss": 0.578,
"step": 1990
},
{
"epoch": 0.7714561234329798,
"grad_norm": 1.5024161338806152,
"learning_rate": 2.0002000200020003e-05,
"loss": 0.4779,
"step": 2000
},
{
"epoch": 0.7714561234329798,
"eval_loss": 0.548928439617157,
"eval_runtime": 94.5508,
"eval_samples_per_second": 54.838,
"eval_steps_per_second": 6.864,
"step": 2000
},
{
"epoch": 0.7753134040501446,
"grad_norm": 1.4644631147384644,
"learning_rate": 1.9976997699769978e-05,
"loss": 0.545,
"step": 2010
},
{
"epoch": 0.7791706846673095,
"grad_norm": 1.394882082939148,
"learning_rate": 1.995199519951995e-05,
"loss": 0.5502,
"step": 2020
},
{
"epoch": 0.7830279652844745,
"grad_norm": 1.4921457767486572,
"learning_rate": 1.992699269926993e-05,
"loss": 0.6197,
"step": 2030
},
{
"epoch": 0.7868852459016393,
"grad_norm": 1.3136405944824219,
"learning_rate": 1.9901990199019904e-05,
"loss": 0.5296,
"step": 2040
},
{
"epoch": 0.7907425265188043,
"grad_norm": 1.5223480463027954,
"learning_rate": 1.9876987698769876e-05,
"loss": 0.4991,
"step": 2050
},
{
"epoch": 0.7945998071359691,
"grad_norm": 1.4527870416641235,
"learning_rate": 1.9851985198519855e-05,
"loss": 0.5194,
"step": 2060
},
{
"epoch": 0.7984570877531341,
"grad_norm": 1.4777238368988037,
"learning_rate": 1.9826982698269827e-05,
"loss": 0.5511,
"step": 2070
},
{
"epoch": 0.8023143683702989,
"grad_norm": 1.8136184215545654,
"learning_rate": 1.9801980198019803e-05,
"loss": 0.5819,
"step": 2080
},
{
"epoch": 0.8061716489874639,
"grad_norm": 1.7190624475479126,
"learning_rate": 1.9776977697769778e-05,
"loss": 0.5725,
"step": 2090
},
{
"epoch": 0.8100289296046287,
"grad_norm": 1.2566032409667969,
"learning_rate": 1.9751975197519753e-05,
"loss": 0.5471,
"step": 2100
},
{
"epoch": 0.8100289296046287,
"eval_loss": 0.5430962443351746,
"eval_runtime": 94.4905,
"eval_samples_per_second": 54.873,
"eval_steps_per_second": 6.868,
"step": 2100
},
{
"epoch": 0.8138862102217936,
"grad_norm": 1.1948508024215698,
"learning_rate": 1.9726972697269725e-05,
"loss": 0.5449,
"step": 2110
},
{
"epoch": 0.8177434908389586,
"grad_norm": 1.355807900428772,
"learning_rate": 1.9701970197019704e-05,
"loss": 0.5238,
"step": 2120
},
{
"epoch": 0.8216007714561234,
"grad_norm": 1.4238370656967163,
"learning_rate": 1.9676967696769676e-05,
"loss": 0.5425,
"step": 2130
},
{
"epoch": 0.8254580520732884,
"grad_norm": 1.5667427778244019,
"learning_rate": 1.9651965196519652e-05,
"loss": 0.5571,
"step": 2140
},
{
"epoch": 0.8293153326904532,
"grad_norm": 1.5513569116592407,
"learning_rate": 1.9626962696269627e-05,
"loss": 0.5631,
"step": 2150
},
{
"epoch": 0.8331726133076182,
"grad_norm": 1.3871880769729614,
"learning_rate": 1.9601960196019603e-05,
"loss": 0.5687,
"step": 2160
},
{
"epoch": 0.837029893924783,
"grad_norm": 1.4342153072357178,
"learning_rate": 1.9576957695769578e-05,
"loss": 0.5193,
"step": 2170
},
{
"epoch": 0.840887174541948,
"grad_norm": 1.4925063848495483,
"learning_rate": 1.9551955195519553e-05,
"loss": 0.5548,
"step": 2180
},
{
"epoch": 0.8447444551591128,
"grad_norm": 1.5816041231155396,
"learning_rate": 1.952695269526953e-05,
"loss": 0.5538,
"step": 2190
},
{
"epoch": 0.8486017357762777,
"grad_norm": 1.803604006767273,
"learning_rate": 1.9501950195019504e-05,
"loss": 0.4947,
"step": 2200
},
{
"epoch": 0.8486017357762777,
"eval_loss": 0.5378134846687317,
"eval_runtime": 94.5121,
"eval_samples_per_second": 54.861,
"eval_steps_per_second": 6.867,
"step": 2200
},
{
"epoch": 0.8524590163934426,
"grad_norm": 1.5246657133102417,
"learning_rate": 1.947694769476948e-05,
"loss": 0.5183,
"step": 2210
},
{
"epoch": 0.8563162970106075,
"grad_norm": 1.4470975399017334,
"learning_rate": 1.9451945194519452e-05,
"loss": 0.5197,
"step": 2220
},
{
"epoch": 0.8601735776277725,
"grad_norm": 1.6767865419387817,
"learning_rate": 1.9426942694269427e-05,
"loss": 0.5654,
"step": 2230
},
{
"epoch": 0.8640308582449373,
"grad_norm": 1.5155974626541138,
"learning_rate": 1.9401940194019403e-05,
"loss": 0.6042,
"step": 2240
},
{
"epoch": 0.8678881388621023,
"grad_norm": 1.6148077249526978,
"learning_rate": 1.9376937693769378e-05,
"loss": 0.5055,
"step": 2250
},
{
"epoch": 0.8717454194792671,
"grad_norm": 1.5768954753875732,
"learning_rate": 1.9351935193519353e-05,
"loss": 0.4966,
"step": 2260
},
{
"epoch": 0.875602700096432,
"grad_norm": 1.5010885000228882,
"learning_rate": 1.932693269326933e-05,
"loss": 0.501,
"step": 2270
},
{
"epoch": 0.8794599807135969,
"grad_norm": 1.661967158317566,
"learning_rate": 1.93019301930193e-05,
"loss": 0.5405,
"step": 2280
},
{
"epoch": 0.8833172613307618,
"grad_norm": 1.5393158197402954,
"learning_rate": 1.927692769276928e-05,
"loss": 0.5544,
"step": 2290
},
{
"epoch": 0.8871745419479267,
"grad_norm": 1.7475782632827759,
"learning_rate": 1.9251925192519252e-05,
"loss": 0.6173,
"step": 2300
},
{
"epoch": 0.8871745419479267,
"eval_loss": 0.5320296287536621,
"eval_runtime": 94.5369,
"eval_samples_per_second": 54.846,
"eval_steps_per_second": 6.865,
"step": 2300
},
{
"epoch": 0.8910318225650916,
"grad_norm": 1.3934800624847412,
"learning_rate": 1.9226922692269227e-05,
"loss": 0.5125,
"step": 2310
},
{
"epoch": 0.8948891031822566,
"grad_norm": 1.6484580039978027,
"learning_rate": 1.9201920192019203e-05,
"loss": 0.494,
"step": 2320
},
{
"epoch": 0.8987463837994214,
"grad_norm": 1.6516157388687134,
"learning_rate": 1.9176917691769178e-05,
"loss": 0.5253,
"step": 2330
},
{
"epoch": 0.9026036644165863,
"grad_norm": 1.5073869228363037,
"learning_rate": 1.9151915191519153e-05,
"loss": 0.5516,
"step": 2340
},
{
"epoch": 0.9064609450337512,
"grad_norm": 1.582481026649475,
"learning_rate": 1.912691269126913e-05,
"loss": 0.5621,
"step": 2350
},
{
"epoch": 0.9103182256509161,
"grad_norm": 1.4449944496154785,
"learning_rate": 1.9101910191019104e-05,
"loss": 0.5494,
"step": 2360
},
{
"epoch": 0.914175506268081,
"grad_norm": 1.7907747030258179,
"learning_rate": 1.9076907690769076e-05,
"loss": 0.5404,
"step": 2370
},
{
"epoch": 0.9180327868852459,
"grad_norm": 1.719509243965149,
"learning_rate": 1.9051905190519055e-05,
"loss": 0.5283,
"step": 2380
},
{
"epoch": 0.9218900675024108,
"grad_norm": 1.5800633430480957,
"learning_rate": 1.9026902690269027e-05,
"loss": 0.5292,
"step": 2390
},
{
"epoch": 0.9257473481195757,
"grad_norm": 1.4846770763397217,
"learning_rate": 1.9001900190019003e-05,
"loss": 0.524,
"step": 2400
},
{
"epoch": 0.9257473481195757,
"eval_loss": 0.5241175889968872,
"eval_runtime": 94.4587,
"eval_samples_per_second": 54.892,
"eval_steps_per_second": 6.871,
"step": 2400
},
{
"epoch": 0.9296046287367405,
"grad_norm": 1.7714641094207764,
"learning_rate": 1.8976897689768978e-05,
"loss": 0.4915,
"step": 2410
},
{
"epoch": 0.9334619093539055,
"grad_norm": 1.964656114578247,
"learning_rate": 1.8951895189518953e-05,
"loss": 0.4874,
"step": 2420
},
{
"epoch": 0.9373191899710704,
"grad_norm": 1.6763602495193481,
"learning_rate": 1.8926892689268925e-05,
"loss": 0.5526,
"step": 2430
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.6096868515014648,
"learning_rate": 1.8901890189018904e-05,
"loss": 0.5101,
"step": 2440
},
{
"epoch": 0.9450337512054002,
"grad_norm": 1.5164107084274292,
"learning_rate": 1.8876887688768876e-05,
"loss": 0.5307,
"step": 2450
},
{
"epoch": 0.9488910318225651,
"grad_norm": 1.4356317520141602,
"learning_rate": 1.885188518851885e-05,
"loss": 0.4733,
"step": 2460
},
{
"epoch": 0.95274831243973,
"grad_norm": 1.6256446838378906,
"learning_rate": 1.8826882688268827e-05,
"loss": 0.5726,
"step": 2470
},
{
"epoch": 0.9566055930568949,
"grad_norm": 1.5358326435089111,
"learning_rate": 1.8801880188018802e-05,
"loss": 0.5134,
"step": 2480
},
{
"epoch": 0.9604628736740598,
"grad_norm": 1.862509846687317,
"learning_rate": 1.8776877687768778e-05,
"loss": 0.5277,
"step": 2490
},
{
"epoch": 0.9643201542912246,
"grad_norm": 1.7659302949905396,
"learning_rate": 1.8751875187518753e-05,
"loss": 0.5523,
"step": 2500
},
{
"epoch": 0.9643201542912246,
"eval_loss": 0.5180462002754211,
"eval_runtime": 94.4176,
"eval_samples_per_second": 54.916,
"eval_steps_per_second": 6.874,
"step": 2500
},
{
"epoch": 0.9681774349083896,
"grad_norm": 1.5947084426879883,
"learning_rate": 1.872687268726873e-05,
"loss": 0.5419,
"step": 2510
},
{
"epoch": 0.9720347155255545,
"grad_norm": 1.829914927482605,
"learning_rate": 1.87018701870187e-05,
"loss": 0.5897,
"step": 2520
},
{
"epoch": 0.9758919961427194,
"grad_norm": 1.3083444833755493,
"learning_rate": 1.867686768676868e-05,
"loss": 0.4932,
"step": 2530
},
{
"epoch": 0.9797492767598843,
"grad_norm": 1.5652191638946533,
"learning_rate": 1.865186518651865e-05,
"loss": 0.4967,
"step": 2540
},
{
"epoch": 0.9836065573770492,
"grad_norm": 1.7959744930267334,
"learning_rate": 1.8626862686268627e-05,
"loss": 0.4934,
"step": 2550
},
{
"epoch": 0.9874638379942141,
"grad_norm": 1.6218141317367554,
"learning_rate": 1.8601860186018602e-05,
"loss": 0.4809,
"step": 2560
},
{
"epoch": 0.991321118611379,
"grad_norm": 1.641104817390442,
"learning_rate": 1.8576857685768578e-05,
"loss": 0.4789,
"step": 2570
},
{
"epoch": 0.9951783992285439,
"grad_norm": 1.732410192489624,
"learning_rate": 1.8551855185518553e-05,
"loss": 0.4998,
"step": 2580
},
{
"epoch": 0.9990356798457087,
"grad_norm": 1.8680731058120728,
"learning_rate": 1.852685268526853e-05,
"loss": 0.5097,
"step": 2590
},
{
"epoch": 1.0028929604628736,
"grad_norm": 1.7208608388900757,
"learning_rate": 1.85018501850185e-05,
"loss": 0.4809,
"step": 2600
},
{
"epoch": 1.0028929604628736,
"eval_loss": 0.5124805569648743,
"eval_runtime": 94.4179,
"eval_samples_per_second": 54.915,
"eval_steps_per_second": 6.874,
"step": 2600
},
{
"epoch": 1.0067502410800386,
"grad_norm": 1.9916785955429077,
"learning_rate": 1.847684768476848e-05,
"loss": 0.4324,
"step": 2610
},
{
"epoch": 1.0106075216972035,
"grad_norm": 1.5762462615966797,
"learning_rate": 1.845184518451845e-05,
"loss": 0.4817,
"step": 2620
},
{
"epoch": 1.0144648023143683,
"grad_norm": 2.0109360218048096,
"learning_rate": 1.8426842684268427e-05,
"loss": 0.441,
"step": 2630
},
{
"epoch": 1.0183220829315334,
"grad_norm": 1.7828129529953003,
"learning_rate": 1.8401840184018402e-05,
"loss": 0.4551,
"step": 2640
},
{
"epoch": 1.0221793635486982,
"grad_norm": 1.7471317052841187,
"learning_rate": 1.8376837683768378e-05,
"loss": 0.3956,
"step": 2650
},
{
"epoch": 1.026036644165863,
"grad_norm": 1.9026498794555664,
"learning_rate": 1.8351835183518353e-05,
"loss": 0.4544,
"step": 2660
},
{
"epoch": 1.0298939247830279,
"grad_norm": 1.9493508338928223,
"learning_rate": 1.832683268326833e-05,
"loss": 0.4609,
"step": 2670
},
{
"epoch": 1.033751205400193,
"grad_norm": 1.8381072282791138,
"learning_rate": 1.8301830183018304e-05,
"loss": 0.4321,
"step": 2680
},
{
"epoch": 1.0376084860173578,
"grad_norm": 1.5527135133743286,
"learning_rate": 1.8276827682768276e-05,
"loss": 0.4112,
"step": 2690
},
{
"epoch": 1.0414657666345226,
"grad_norm": 2.231661319732666,
"learning_rate": 1.8251825182518255e-05,
"loss": 0.4279,
"step": 2700
},
{
"epoch": 1.0414657666345226,
"eval_loss": 0.5103564262390137,
"eval_runtime": 94.417,
"eval_samples_per_second": 54.916,
"eval_steps_per_second": 6.874,
"step": 2700
},
{
"epoch": 1.0453230472516875,
"grad_norm": 3.195507049560547,
"learning_rate": 1.8226822682268227e-05,
"loss": 0.4678,
"step": 2710
},
{
"epoch": 1.0491803278688525,
"grad_norm": 1.8608683347702026,
"learning_rate": 1.8201820182018202e-05,
"loss": 0.4831,
"step": 2720
},
{
"epoch": 1.0530376084860174,
"grad_norm": 2.1820995807647705,
"learning_rate": 1.8176817681768178e-05,
"loss": 0.4,
"step": 2730
},
{
"epoch": 1.0568948891031822,
"grad_norm": 1.7552732229232788,
"learning_rate": 1.8151815181518153e-05,
"loss": 0.4431,
"step": 2740
},
{
"epoch": 1.0607521697203472,
"grad_norm": 2.040696859359741,
"learning_rate": 1.8126812681268125e-05,
"loss": 0.528,
"step": 2750
},
{
"epoch": 1.064609450337512,
"grad_norm": 1.7921245098114014,
"learning_rate": 1.8101810181018104e-05,
"loss": 0.449,
"step": 2760
},
{
"epoch": 1.068466730954677,
"grad_norm": 2.0593929290771484,
"learning_rate": 1.8076807680768076e-05,
"loss": 0.41,
"step": 2770
},
{
"epoch": 1.0723240115718418,
"grad_norm": 2.059739112854004,
"learning_rate": 1.805180518051805e-05,
"loss": 0.4451,
"step": 2780
},
{
"epoch": 1.0761812921890068,
"grad_norm": 2.0607693195343018,
"learning_rate": 1.802680268026803e-05,
"loss": 0.4387,
"step": 2790
},
{
"epoch": 1.0800385728061717,
"grad_norm": 1.7160958051681519,
"learning_rate": 1.8001800180018002e-05,
"loss": 0.4501,
"step": 2800
},
{
"epoch": 1.0800385728061717,
"eval_loss": 0.5034841895103455,
"eval_runtime": 94.5244,
"eval_samples_per_second": 54.854,
"eval_steps_per_second": 6.866,
"step": 2800
},
{
"epoch": 1.0838958534233365,
"grad_norm": 1.879629373550415,
"learning_rate": 1.7976797679767978e-05,
"loss": 0.4553,
"step": 2810
},
{
"epoch": 1.0877531340405016,
"grad_norm": 2.0610523223876953,
"learning_rate": 1.7951795179517953e-05,
"loss": 0.4842,
"step": 2820
},
{
"epoch": 1.0916104146576664,
"grad_norm": 1.8454833030700684,
"learning_rate": 1.792679267926793e-05,
"loss": 0.4288,
"step": 2830
},
{
"epoch": 1.0954676952748312,
"grad_norm": 1.7830801010131836,
"learning_rate": 1.79017901790179e-05,
"loss": 0.4552,
"step": 2840
},
{
"epoch": 1.099324975891996,
"grad_norm": 1.7110368013381958,
"learning_rate": 1.787678767876788e-05,
"loss": 0.4557,
"step": 2850
},
{
"epoch": 1.1031822565091611,
"grad_norm": 2.69413161277771,
"learning_rate": 1.785178517851785e-05,
"loss": 0.5252,
"step": 2860
},
{
"epoch": 1.107039537126326,
"grad_norm": 2.2572829723358154,
"learning_rate": 1.7826782678267827e-05,
"loss": 0.5042,
"step": 2870
},
{
"epoch": 1.1108968177434908,
"grad_norm": 2.144115447998047,
"learning_rate": 1.7801780178017802e-05,
"loss": 0.4615,
"step": 2880
},
{
"epoch": 1.1147540983606556,
"grad_norm": 1.661698818206787,
"learning_rate": 1.7776777677767778e-05,
"loss": 0.429,
"step": 2890
},
{
"epoch": 1.1186113789778207,
"grad_norm": 2.2900257110595703,
"learning_rate": 1.7751775177517753e-05,
"loss": 0.4651,
"step": 2900
},
{
"epoch": 1.1186113789778207,
"eval_loss": 0.4993349611759186,
"eval_runtime": 94.4361,
"eval_samples_per_second": 54.905,
"eval_steps_per_second": 6.872,
"step": 2900
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3606948147288474e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}