Training in progress, step 110, checkpoint (commit 53c04e6, verified)
{
"best_metric": 0.8914665579795837,
"best_model_checkpoint": "miner_id_24/checkpoint-110",
"epoch": 0.26252983293556087,
"eval_steps": 5,
"global_step": 110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002386634844868735,
"grad_norm": 1.6264880895614624,
"learning_rate": 2e-05,
"loss": 1.7576,
"step": 1
},
{
"epoch": 0.002386634844868735,
"eval_loss": 1.4666008949279785,
"eval_runtime": 26.2232,
"eval_samples_per_second": 6.75,
"eval_steps_per_second": 3.394,
"step": 1
},
{
"epoch": 0.00477326968973747,
"grad_norm": 0.8592613935470581,
"learning_rate": 4e-05,
"loss": 0.8915,
"step": 2
},
{
"epoch": 0.007159904534606206,
"grad_norm": 0.985676646232605,
"learning_rate": 6e-05,
"loss": 1.1643,
"step": 3
},
{
"epoch": 0.00954653937947494,
"grad_norm": 1.3740118741989136,
"learning_rate": 8e-05,
"loss": 1.5305,
"step": 4
},
{
"epoch": 0.011933174224343675,
"grad_norm": 0.9007270336151123,
"learning_rate": 0.0001,
"loss": 1.191,
"step": 5
},
{
"epoch": 0.011933174224343675,
"eval_loss": 1.421371340751648,
"eval_runtime": 26.2293,
"eval_samples_per_second": 6.748,
"eval_steps_per_second": 3.393,
"step": 5
},
{
"epoch": 0.014319809069212411,
"grad_norm": 2.1037585735321045,
"learning_rate": 0.00012,
"loss": 1.6708,
"step": 6
},
{
"epoch": 0.016706443914081145,
"grad_norm": 0.9997168183326721,
"learning_rate": 0.00014,
"loss": 1.2175,
"step": 7
},
{
"epoch": 0.01909307875894988,
"grad_norm": 1.029247522354126,
"learning_rate": 0.00016,
"loss": 1.2724,
"step": 8
},
{
"epoch": 0.021479713603818614,
"grad_norm": 1.2429360151290894,
"learning_rate": 0.00018,
"loss": 1.3846,
"step": 9
},
{
"epoch": 0.02386634844868735,
"grad_norm": 0.4330953061580658,
"learning_rate": 0.0002,
"loss": 0.4449,
"step": 10
},
{
"epoch": 0.02386634844868735,
"eval_loss": 1.1578279733657837,
"eval_runtime": 27.585,
"eval_samples_per_second": 6.417,
"eval_steps_per_second": 3.226,
"step": 10
},
{
"epoch": 0.026252983293556086,
"grad_norm": 0.9032881259918213,
"learning_rate": 0.0001999979446958366,
"loss": 1.1913,
"step": 11
},
{
"epoch": 0.028639618138424822,
"grad_norm": 1.043502926826477,
"learning_rate": 0.00019999177886783194,
"loss": 1.067,
"step": 12
},
{
"epoch": 0.031026252983293555,
"grad_norm": 0.984898030757904,
"learning_rate": 0.00019998150276943902,
"loss": 1.2715,
"step": 13
},
{
"epoch": 0.03341288782816229,
"grad_norm": 0.8400496244430542,
"learning_rate": 0.000199967116823068,
"loss": 0.8158,
"step": 14
},
{
"epoch": 0.03579952267303103,
"grad_norm": 0.925491452217102,
"learning_rate": 0.0001999486216200688,
"loss": 1.6707,
"step": 15
},
{
"epoch": 0.03579952267303103,
"eval_loss": 1.0528874397277832,
"eval_runtime": 26.2645,
"eval_samples_per_second": 6.739,
"eval_steps_per_second": 3.389,
"step": 15
},
{
"epoch": 0.03818615751789976,
"grad_norm": 0.749546468257904,
"learning_rate": 0.00019992601792070679,
"loss": 0.9433,
"step": 16
},
{
"epoch": 0.0405727923627685,
"grad_norm": 0.9893090128898621,
"learning_rate": 0.00019989930665413147,
"loss": 1.0754,
"step": 17
},
{
"epoch": 0.04295942720763723,
"grad_norm": 0.8510335087776184,
"learning_rate": 0.00019986848891833845,
"loss": 1.2892,
"step": 18
},
{
"epoch": 0.045346062052505964,
"grad_norm": 0.8910903334617615,
"learning_rate": 0.0001998335659801241,
"loss": 0.8846,
"step": 19
},
{
"epoch": 0.0477326968973747,
"grad_norm": 0.8577730059623718,
"learning_rate": 0.00019979453927503364,
"loss": 0.9342,
"step": 20
},
{
"epoch": 0.0477326968973747,
"eval_loss": 1.0087474584579468,
"eval_runtime": 26.2663,
"eval_samples_per_second": 6.739,
"eval_steps_per_second": 3.388,
"step": 20
},
{
"epoch": 0.050119331742243436,
"grad_norm": 0.9688092470169067,
"learning_rate": 0.00019975141040730207,
"loss": 1.1933,
"step": 21
},
{
"epoch": 0.05250596658711217,
"grad_norm": 0.7774447202682495,
"learning_rate": 0.0001997041811497882,
"loss": 0.8412,
"step": 22
},
{
"epoch": 0.05489260143198091,
"grad_norm": 0.7570910453796387,
"learning_rate": 0.00019965285344390184,
"loss": 0.8872,
"step": 23
},
{
"epoch": 0.057279236276849645,
"grad_norm": 0.5232482552528381,
"learning_rate": 0.00019959742939952392,
"loss": 0.9852,
"step": 24
},
{
"epoch": 0.059665871121718374,
"grad_norm": 1.0864367485046387,
"learning_rate": 0.00019953791129491983,
"loss": 1.1447,
"step": 25
},
{
"epoch": 0.059665871121718374,
"eval_loss": 0.9940932393074036,
"eval_runtime": 26.2586,
"eval_samples_per_second": 6.741,
"eval_steps_per_second": 3.389,
"step": 25
},
{
"epoch": 0.06205250596658711,
"grad_norm": 0.9833227396011353,
"learning_rate": 0.00019947430157664576,
"loss": 1.4313,
"step": 26
},
{
"epoch": 0.06443914081145585,
"grad_norm": 0.7902147769927979,
"learning_rate": 0.00019940660285944803,
"loss": 1.0728,
"step": 27
},
{
"epoch": 0.06682577565632458,
"grad_norm": 0.6701146364212036,
"learning_rate": 0.00019933481792615583,
"loss": 0.8888,
"step": 28
},
{
"epoch": 0.06921241050119331,
"grad_norm": 0.845893383026123,
"learning_rate": 0.0001992589497275665,
"loss": 0.9849,
"step": 29
},
{
"epoch": 0.07159904534606205,
"grad_norm": 0.7827535271644592,
"learning_rate": 0.0001991790013823246,
"loss": 1.331,
"step": 30
},
{
"epoch": 0.07159904534606205,
"eval_loss": 0.9710781574249268,
"eval_runtime": 26.2207,
"eval_samples_per_second": 6.75,
"eval_steps_per_second": 3.394,
"step": 30
},
{
"epoch": 0.07398568019093078,
"grad_norm": 0.9344210028648376,
"learning_rate": 0.00019909497617679348,
"loss": 1.013,
"step": 31
},
{
"epoch": 0.07637231503579953,
"grad_norm": 0.6921480894088745,
"learning_rate": 0.0001990068775649202,
"loss": 0.6896,
"step": 32
},
{
"epoch": 0.07875894988066826,
"grad_norm": 0.804854154586792,
"learning_rate": 0.00019891470916809362,
"loss": 0.8884,
"step": 33
},
{
"epoch": 0.081145584725537,
"grad_norm": 0.7858572602272034,
"learning_rate": 0.00019881847477499557,
"loss": 0.8469,
"step": 34
},
{
"epoch": 0.08353221957040573,
"grad_norm": 0.7166551947593689,
"learning_rate": 0.00019871817834144504,
"loss": 0.9763,
"step": 35
},
{
"epoch": 0.08353221957040573,
"eval_loss": 0.9543755054473877,
"eval_runtime": 26.2507,
"eval_samples_per_second": 6.743,
"eval_steps_per_second": 3.39,
"step": 35
},
{
"epoch": 0.08591885441527446,
"grad_norm": 0.7922567129135132,
"learning_rate": 0.0001986138239902355,
"loss": 1.0582,
"step": 36
},
{
"epoch": 0.0883054892601432,
"grad_norm": 1.0015374422073364,
"learning_rate": 0.0001985054160109657,
"loss": 1.0839,
"step": 37
},
{
"epoch": 0.09069212410501193,
"grad_norm": 1.455461025238037,
"learning_rate": 0.00019839295885986296,
"loss": 1.2132,
"step": 38
},
{
"epoch": 0.09307875894988067,
"grad_norm": 0.781535267829895,
"learning_rate": 0.0001982764571596004,
"loss": 0.7504,
"step": 39
},
{
"epoch": 0.0954653937947494,
"grad_norm": 0.8738696575164795,
"learning_rate": 0.00019815591569910654,
"loss": 0.6841,
"step": 40
},
{
"epoch": 0.0954653937947494,
"eval_loss": 0.9494202136993408,
"eval_runtime": 26.2561,
"eval_samples_per_second": 6.741,
"eval_steps_per_second": 3.39,
"step": 40
},
{
"epoch": 0.09785202863961814,
"grad_norm": 1.0066485404968262,
"learning_rate": 0.00019803133943336874,
"loss": 1.1701,
"step": 41
},
{
"epoch": 0.10023866348448687,
"grad_norm": 0.8418964147567749,
"learning_rate": 0.0001979027334832293,
"loss": 1.0454,
"step": 42
},
{
"epoch": 0.1026252983293556,
"grad_norm": 0.9529140591621399,
"learning_rate": 0.00019777010313517518,
"loss": 0.9311,
"step": 43
},
{
"epoch": 0.10501193317422435,
"grad_norm": 0.9450991749763489,
"learning_rate": 0.00019763345384112043,
"loss": 0.9843,
"step": 44
},
{
"epoch": 0.10739856801909307,
"grad_norm": 0.8391886353492737,
"learning_rate": 0.00019749279121818235,
"loss": 0.7273,
"step": 45
},
{
"epoch": 0.10739856801909307,
"eval_loss": 0.9417486786842346,
"eval_runtime": 26.2253,
"eval_samples_per_second": 6.749,
"eval_steps_per_second": 3.394,
"step": 45
},
{
"epoch": 0.10978520286396182,
"grad_norm": 0.8711974620819092,
"learning_rate": 0.00019734812104845047,
"loss": 1.0276,
"step": 46
},
{
"epoch": 0.11217183770883055,
"grad_norm": 0.6002468466758728,
"learning_rate": 0.00019719944927874881,
"loss": 0.7602,
"step": 47
},
{
"epoch": 0.11455847255369929,
"grad_norm": 1.059345006942749,
"learning_rate": 0.0001970467820203915,
"loss": 1.3701,
"step": 48
},
{
"epoch": 0.11694510739856802,
"grad_norm": 0.9398312568664551,
"learning_rate": 0.00019689012554893154,
"loss": 0.9587,
"step": 49
},
{
"epoch": 0.11933174224343675,
"grad_norm": 0.8663720488548279,
"learning_rate": 0.00019672948630390294,
"loss": 0.766,
"step": 50
},
{
"epoch": 0.11933174224343675,
"eval_loss": 0.9387193918228149,
"eval_runtime": 26.2728,
"eval_samples_per_second": 6.737,
"eval_steps_per_second": 3.388,
"step": 50
},
{
"epoch": 0.12171837708830549,
"grad_norm": 0.7873828411102295,
"learning_rate": 0.00019656487088855592,
"loss": 1.2591,
"step": 51
},
{
"epoch": 0.12410501193317422,
"grad_norm": 0.6708641648292542,
"learning_rate": 0.00019639628606958533,
"loss": 0.7497,
"step": 52
},
{
"epoch": 0.12649164677804295,
"grad_norm": 0.5296116471290588,
"learning_rate": 0.0001962237387768529,
"loss": 0.9209,
"step": 53
},
{
"epoch": 0.1288782816229117,
"grad_norm": 0.701153576374054,
"learning_rate": 0.00019604723610310194,
"loss": 0.9039,
"step": 54
},
{
"epoch": 0.13126491646778043,
"grad_norm": 0.607571005821228,
"learning_rate": 0.00019586678530366606,
"loss": 0.9841,
"step": 55
},
{
"epoch": 0.13126491646778043,
"eval_loss": 0.9331147074699402,
"eval_runtime": 26.2365,
"eval_samples_per_second": 6.746,
"eval_steps_per_second": 3.392,
"step": 55
},
{
"epoch": 0.13365155131264916,
"grad_norm": 0.7479352951049805,
"learning_rate": 0.00019568239379617088,
"loss": 0.8665,
"step": 56
},
{
"epoch": 0.1360381861575179,
"grad_norm": 0.8115050196647644,
"learning_rate": 0.00019549406916022905,
"loss": 1.0235,
"step": 57
},
{
"epoch": 0.13842482100238662,
"grad_norm": 0.5173358917236328,
"learning_rate": 0.00019530181913712872,
"loss": 0.8548,
"step": 58
},
{
"epoch": 0.14081145584725538,
"grad_norm": 0.917434811592102,
"learning_rate": 0.00019510565162951537,
"loss": 0.9314,
"step": 59
},
{
"epoch": 0.1431980906921241,
"grad_norm": 0.7886952757835388,
"learning_rate": 0.00019490557470106686,
"loss": 0.8733,
"step": 60
},
{
"epoch": 0.1431980906921241,
"eval_loss": 0.9312522411346436,
"eval_runtime": 26.2683,
"eval_samples_per_second": 6.738,
"eval_steps_per_second": 3.388,
"step": 60
},
{
"epoch": 0.14558472553699284,
"grad_norm": 0.7969014644622803,
"learning_rate": 0.00019470159657616215,
"loss": 0.6603,
"step": 61
},
{
"epoch": 0.14797136038186157,
"grad_norm": 0.9583745002746582,
"learning_rate": 0.00019449372563954293,
"loss": 1.466,
"step": 62
},
{
"epoch": 0.15035799522673032,
"grad_norm": 0.7162270545959473,
"learning_rate": 0.0001942819704359693,
"loss": 0.8824,
"step": 63
},
{
"epoch": 0.15274463007159905,
"grad_norm": 0.47627589106559753,
"learning_rate": 0.00019406633966986828,
"loss": 0.6296,
"step": 64
},
{
"epoch": 0.15513126491646778,
"grad_norm": 0.7122361063957214,
"learning_rate": 0.00019384684220497605,
"loss": 0.64,
"step": 65
},
{
"epoch": 0.15513126491646778,
"eval_loss": 0.9237020015716553,
"eval_runtime": 26.2941,
"eval_samples_per_second": 6.732,
"eval_steps_per_second": 3.385,
"step": 65
},
{
"epoch": 0.1575178997613365,
"grad_norm": 0.8619460463523865,
"learning_rate": 0.00019362348706397373,
"loss": 1.1417,
"step": 66
},
{
"epoch": 0.15990453460620524,
"grad_norm": 0.7796176075935364,
"learning_rate": 0.00019339628342811632,
"loss": 0.859,
"step": 67
},
{
"epoch": 0.162291169451074,
"grad_norm": 0.8874034285545349,
"learning_rate": 0.0001931652406368554,
"loss": 0.9786,
"step": 68
},
{
"epoch": 0.16467780429594273,
"grad_norm": 0.7292467951774597,
"learning_rate": 0.0001929303681874552,
"loss": 0.8093,
"step": 69
},
{
"epoch": 0.16706443914081145,
"grad_norm": 0.8368440270423889,
"learning_rate": 0.0001926916757346022,
"loss": 1.3638,
"step": 70
},
{
"epoch": 0.16706443914081145,
"eval_loss": 0.9212387800216675,
"eval_runtime": 26.2497,
"eval_samples_per_second": 6.743,
"eval_steps_per_second": 3.391,
"step": 70
},
{
"epoch": 0.16945107398568018,
"grad_norm": 0.6919611692428589,
"learning_rate": 0.00019244917309000817,
"loss": 0.9292,
"step": 71
},
{
"epoch": 0.1718377088305489,
"grad_norm": 0.8706820607185364,
"learning_rate": 0.00019220287022200707,
"loss": 1.122,
"step": 72
},
{
"epoch": 0.17422434367541767,
"grad_norm": 0.6912488341331482,
"learning_rate": 0.0001919527772551451,
"loss": 1.0077,
"step": 73
},
{
"epoch": 0.1766109785202864,
"grad_norm": 0.6089036464691162,
"learning_rate": 0.00019169890446976454,
"loss": 0.9816,
"step": 74
},
{
"epoch": 0.17899761336515513,
"grad_norm": 0.7333152294158936,
"learning_rate": 0.00019144126230158127,
"loss": 0.8958,
"step": 75
},
{
"epoch": 0.17899761336515513,
"eval_loss": 0.9194671511650085,
"eval_runtime": 26.2263,
"eval_samples_per_second": 6.749,
"eval_steps_per_second": 3.394,
"step": 75
},
{
"epoch": 0.18138424821002386,
"grad_norm": 0.7732102870941162,
"learning_rate": 0.0001911798613412557,
"loss": 0.8417,
"step": 76
},
{
"epoch": 0.18377088305489261,
"grad_norm": 0.6444439888000488,
"learning_rate": 0.0001909147123339575,
"loss": 0.9722,
"step": 77
},
{
"epoch": 0.18615751789976134,
"grad_norm": 0.522572934627533,
"learning_rate": 0.0001906458261789238,
"loss": 0.7848,
"step": 78
},
{
"epoch": 0.18854415274463007,
"grad_norm": 0.6048774719238281,
"learning_rate": 0.00019037321392901136,
"loss": 1.0934,
"step": 79
},
{
"epoch": 0.1909307875894988,
"grad_norm": 0.7766179442405701,
"learning_rate": 0.0001900968867902419,
"loss": 0.8923,
"step": 80
},
{
"epoch": 0.1909307875894988,
"eval_loss": 0.9133721590042114,
"eval_runtime": 26.2379,
"eval_samples_per_second": 6.746,
"eval_steps_per_second": 3.392,
"step": 80
},
{
"epoch": 0.19331742243436753,
"grad_norm": 0.6204676032066345,
"learning_rate": 0.0001898168561213419,
"loss": 0.7503,
"step": 81
},
{
"epoch": 0.1957040572792363,
"grad_norm": 0.5830619931221008,
"learning_rate": 0.0001895331334332753,
"loss": 0.7373,
"step": 82
},
{
"epoch": 0.19809069212410502,
"grad_norm": 0.6380212306976318,
"learning_rate": 0.0001892457303887706,
"loss": 0.789,
"step": 83
},
{
"epoch": 0.20047732696897375,
"grad_norm": 0.6974972486495972,
"learning_rate": 0.0001889546588018412,
"loss": 0.8987,
"step": 84
},
{
"epoch": 0.20286396181384247,
"grad_norm": 0.6037717461585999,
"learning_rate": 0.00018865993063730004,
"loss": 1.1555,
"step": 85
},
{
"epoch": 0.20286396181384247,
"eval_loss": 0.9121592044830322,
"eval_runtime": 26.2683,
"eval_samples_per_second": 6.738,
"eval_steps_per_second": 3.388,
"step": 85
},
{
"epoch": 0.2052505966587112,
"grad_norm": 0.5582723617553711,
"learning_rate": 0.00018836155801026753,
"loss": 0.6236,
"step": 86
},
{
"epoch": 0.20763723150357996,
"grad_norm": 0.7444891929626465,
"learning_rate": 0.0001880595531856738,
"loss": 0.6428,
"step": 87
},
{
"epoch": 0.2100238663484487,
"grad_norm": 0.5842642188072205,
"learning_rate": 0.00018775392857775432,
"loss": 0.9149,
"step": 88
},
{
"epoch": 0.21241050119331742,
"grad_norm": 0.6907650232315063,
"learning_rate": 0.00018744469674953956,
"loss": 0.8653,
"step": 89
},
{
"epoch": 0.21479713603818615,
"grad_norm": 0.7942304611206055,
"learning_rate": 0.00018713187041233896,
"loss": 0.7069,
"step": 90
},
{
"epoch": 0.21479713603818615,
"eval_loss": 0.910210132598877,
"eval_runtime": 26.2879,
"eval_samples_per_second": 6.733,
"eval_steps_per_second": 3.386,
"step": 90
},
{
"epoch": 0.2171837708830549,
"grad_norm": 0.7289912104606628,
"learning_rate": 0.00018681546242521786,
"loss": 0.7671,
"step": 91
},
{
"epoch": 0.21957040572792363,
"grad_norm": 0.6675541400909424,
"learning_rate": 0.00018649548579446936,
"loss": 0.6645,
"step": 92
},
{
"epoch": 0.22195704057279236,
"grad_norm": 0.4568694829940796,
"learning_rate": 0.0001861719536730795,
"loss": 0.7777,
"step": 93
},
{
"epoch": 0.2243436754176611,
"grad_norm": 0.715552568435669,
"learning_rate": 0.00018584487936018661,
"loss": 1.1339,
"step": 94
},
{
"epoch": 0.22673031026252982,
"grad_norm": 0.8277347087860107,
"learning_rate": 0.00018551427630053463,
"loss": 0.9398,
"step": 95
},
{
"epoch": 0.22673031026252982,
"eval_loss": 0.9004982113838196,
"eval_runtime": 26.2476,
"eval_samples_per_second": 6.743,
"eval_steps_per_second": 3.391,
"step": 95
},
{
"epoch": 0.22911694510739858,
"grad_norm": 0.7335620522499084,
"learning_rate": 0.00018518015808392045,
"loss": 0.8793,
"step": 96
},
{
"epoch": 0.2315035799522673,
"grad_norm": 0.5709030032157898,
"learning_rate": 0.00018484253844463526,
"loss": 0.7161,
"step": 97
},
{
"epoch": 0.23389021479713604,
"grad_norm": 0.6625402569770813,
"learning_rate": 0.00018450143126090015,
"loss": 0.8631,
"step": 98
},
{
"epoch": 0.23627684964200477,
"grad_norm": 0.3961223065853119,
"learning_rate": 0.00018415685055429533,
"loss": 0.6657,
"step": 99
},
{
"epoch": 0.2386634844868735,
"grad_norm": 0.8134841322898865,
"learning_rate": 0.00018380881048918405,
"loss": 1.1527,
"step": 100
},
{
"epoch": 0.2386634844868735,
"eval_loss": 0.8966440558433533,
"eval_runtime": 26.2452,
"eval_samples_per_second": 6.744,
"eval_steps_per_second": 3.391,
"step": 100
},
{
"epoch": 0.24105011933174225,
"grad_norm": 0.6637277007102966,
"learning_rate": 0.00018345732537213027,
"loss": 1.175,
"step": 101
},
{
"epoch": 0.24343675417661098,
"grad_norm": 0.5118420124053955,
"learning_rate": 0.00018310240965131041,
"loss": 0.852,
"step": 102
},
{
"epoch": 0.2458233890214797,
"grad_norm": 0.6100435256958008,
"learning_rate": 0.00018274407791591966,
"loss": 0.8036,
"step": 103
},
{
"epoch": 0.24821002386634844,
"grad_norm": 0.8629751801490784,
"learning_rate": 0.00018238234489557215,
"loss": 0.9638,
"step": 104
},
{
"epoch": 0.25059665871121717,
"grad_norm": 0.6925487518310547,
"learning_rate": 0.0001820172254596956,
"loss": 1.0022,
"step": 105
},
{
"epoch": 0.25059665871121717,
"eval_loss": 0.8950950503349304,
"eval_runtime": 26.2737,
"eval_samples_per_second": 6.737,
"eval_steps_per_second": 3.387,
"step": 105
},
{
"epoch": 0.2529832935560859,
"grad_norm": 0.6932248473167419,
"learning_rate": 0.00018164873461691986,
"loss": 0.7119,
"step": 106
},
{
"epoch": 0.2553699284009546,
"grad_norm": 0.6727349162101746,
"learning_rate": 0.00018127688751446027,
"loss": 1.0059,
"step": 107
},
{
"epoch": 0.2577565632458234,
"grad_norm": 0.7314417362213135,
"learning_rate": 0.00018090169943749476,
"loss": 0.8165,
"step": 108
},
{
"epoch": 0.26014319809069214,
"grad_norm": 0.5489733219146729,
"learning_rate": 0.0001805231858085356,
"loss": 0.8262,
"step": 109
},
{
"epoch": 0.26252983293556087,
"grad_norm": 0.5698820948600769,
"learning_rate": 0.00018014136218679567,
"loss": 0.7001,
"step": 110
},
{
"epoch": 0.26252983293556087,
"eval_loss": 0.8914665579795837,
"eval_runtime": 26.2981,
"eval_samples_per_second": 6.731,
"eval_steps_per_second": 3.384,
"step": 110
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.153367887413248e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
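
The JSON above follows the standard Hugging Face `TrainerState` layout that the `Trainer` writes alongside each saved checkpoint (typically as `trainer_state.json`). Below is a minimal sketch, using only the Python standard library, of how one might load this file and pull out the headline numbers; the exact file path is an assumption for illustration, based on the `best_model_checkpoint` value recorded above.

```python
import json

# Hypothetical path for illustration; adjust to wherever the checkpoint files live.
STATE_PATH = "miner_id_24/checkpoint-110/trainer_state.json"

with open(STATE_PATH, encoding="utf-8") as f:
    state = json.load(f)

# Headline numbers recorded by the Trainer.
print("best eval metric:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
print("epoch reached:", state["epoch"])
print("global step:", state["global_step"], "of", state["max_steps"])

# Separate per-step training entries from periodic eval entries:
# training rows carry "loss", eval rows carry "eval_loss".
train_rows = [r for r in state["log_history"] if "loss" in r]
eval_rows = [r for r in state["log_history"] if "eval_loss" in r]

print("logged training steps:", len(train_rows))
print("eval points (every", state["eval_steps"], "steps):", len(eval_rows))
print("last eval_loss:", eval_rows[-1]["eval_loss"])
```

In this log, `eval_loss` falls from 1.4666 at step 1 to 0.8915 at step 110, which matches the `best_metric` and `best_model_checkpoint` fields at the top of the file.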