Desm0nt's picture
Update 11.07 bulild v13
820dbd8 verified
{
"best_metric": 1.47908163,
"best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v9-20240710-235159\\checkpoint-500",
"epoch": 2.8828828828828827,
"eval_steps": 50,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"acc": 0.4856407,
"epoch": 0.0036036036036036037,
"grad_norm": 0.734375,
"learning_rate": 2.4107142857142856e-06,
"loss": 2.42667556,
"memory(GiB)": 18.11,
"step": 1,
"train_speed(iter/s)": 0.072451
},
{
"acc": 0.50815099,
"epoch": 0.018018018018018018,
"grad_norm": 0.671875,
"learning_rate": 1.2053571428571429e-05,
"loss": 2.28746271,
"memory(GiB)": 19.3,
"step": 5,
"train_speed(iter/s)": 0.081978
},
{
"acc": 0.50680609,
"epoch": 0.036036036036036036,
"grad_norm": 0.76953125,
"learning_rate": 2.4107142857142858e-05,
"loss": 2.29894772,
"memory(GiB)": 19.3,
"step": 10,
"train_speed(iter/s)": 0.084125
},
{
"acc": 0.51412601,
"epoch": 0.05405405405405406,
"grad_norm": 0.76171875,
"learning_rate": 3.616071428571428e-05,
"loss": 2.34161263,
"memory(GiB)": 19.7,
"step": 15,
"train_speed(iter/s)": 0.08456
},
{
"acc": 0.52338777,
"epoch": 0.07207207207207207,
"grad_norm": 0.6015625,
"learning_rate": 4.8214285714285716e-05,
"loss": 2.23036633,
"memory(GiB)": 19.88,
"step": 20,
"train_speed(iter/s)": 0.084117
},
{
"acc": 0.55944238,
"epoch": 0.09009009009009009,
"grad_norm": 0.66796875,
"learning_rate": 6.026785714285715e-05,
"loss": 2.01084595,
"memory(GiB)": 19.93,
"step": 25,
"train_speed(iter/s)": 0.084444
},
{
"acc": 0.57758675,
"epoch": 0.10810810810810811,
"grad_norm": 0.765625,
"learning_rate": 7.232142857142856e-05,
"loss": 1.94100876,
"memory(GiB)": 20.21,
"step": 30,
"train_speed(iter/s)": 0.085158
},
{
"acc": 0.5666451,
"epoch": 0.12612612612612611,
"grad_norm": 0.796875,
"learning_rate": 8.4375e-05,
"loss": 1.96992569,
"memory(GiB)": 19.42,
"step": 35,
"train_speed(iter/s)": 0.085562
},
{
"acc": 0.55766659,
"epoch": 0.14414414414414414,
"grad_norm": 0.828125,
"learning_rate": 9.642857142857143e-05,
"loss": 2.01305885,
"memory(GiB)": 19.71,
"step": 40,
"train_speed(iter/s)": 0.0857
},
{
"acc": 0.56964116,
"epoch": 0.16216216216216217,
"grad_norm": 0.83203125,
"learning_rate": 0.00010848214285714286,
"loss": 1.925914,
"memory(GiB)": 19.68,
"step": 45,
"train_speed(iter/s)": 0.08577
},
{
"acc": 0.56270452,
"epoch": 0.18018018018018017,
"grad_norm": 0.9375,
"learning_rate": 0.0001205357142857143,
"loss": 1.94923038,
"memory(GiB)": 19.65,
"step": 50,
"train_speed(iter/s)": 0.085942
},
{
"epoch": 0.18018018018018017,
"eval_acc": 0.5890983000739098,
"eval_loss": 1.795773983001709,
"eval_runtime": 136.6505,
"eval_samples_per_second": 1.105,
"eval_steps_per_second": 0.556,
"step": 50
},
{
"acc": 0.57772484,
"epoch": 0.1981981981981982,
"grad_norm": 0.7265625,
"learning_rate": 0.00013258928571428571,
"loss": 1.86195869,
"memory(GiB)": 23.11,
"step": 55,
"train_speed(iter/s)": 0.070857
},
{
"acc": 0.59196444,
"epoch": 0.21621621621621623,
"grad_norm": 0.8125,
"learning_rate": 0.00013499518432841625,
"loss": 1.74724998,
"memory(GiB)": 19.42,
"step": 60,
"train_speed(iter/s)": 0.071911
},
{
"acc": 0.57253065,
"epoch": 0.23423423423423423,
"grad_norm": 0.69921875,
"learning_rate": 0.00013497562184025362,
"loss": 1.87580814,
"memory(GiB)": 19.61,
"step": 65,
"train_speed(iter/s)": 0.072807
},
{
"acc": 0.59546819,
"epoch": 0.25225225225225223,
"grad_norm": 0.73046875,
"learning_rate": 0.00013494101591406666,
"loss": 1.73464546,
"memory(GiB)": 19.58,
"step": 70,
"train_speed(iter/s)": 0.073652
},
{
"acc": 0.59667702,
"epoch": 0.2702702702702703,
"grad_norm": 0.8203125,
"learning_rate": 0.00013489137426511745,
"loss": 1.69518318,
"memory(GiB)": 18.19,
"step": 75,
"train_speed(iter/s)": 0.074445
},
{
"acc": 0.61824327,
"epoch": 0.2882882882882883,
"grad_norm": 0.828125,
"learning_rate": 0.00013482670796082633,
"loss": 1.64374161,
"memory(GiB)": 19.52,
"step": 80,
"train_speed(iter/s)": 0.075071
},
{
"acc": 0.60798159,
"epoch": 0.3063063063063063,
"grad_norm": 0.7734375,
"learning_rate": 0.00013474703141830443,
"loss": 1.68669338,
"memory(GiB)": 19.57,
"step": 85,
"train_speed(iter/s)": 0.07562
},
{
"acc": 0.5981144,
"epoch": 0.32432432432432434,
"grad_norm": 0.80078125,
"learning_rate": 0.00013465236240113953,
"loss": 1.701264,
"memory(GiB)": 20.19,
"step": 90,
"train_speed(iter/s)": 0.076188
},
{
"acc": 0.59871612,
"epoch": 0.34234234234234234,
"grad_norm": 1.0234375,
"learning_rate": 0.00013454272201543564,
"loss": 1.76608849,
"memory(GiB)": 19.35,
"step": 95,
"train_speed(iter/s)": 0.076637
},
{
"acc": 0.61396523,
"epoch": 0.36036036036036034,
"grad_norm": 0.7109375,
"learning_rate": 0.00013441813470510747,
"loss": 1.61449242,
"memory(GiB)": 19.69,
"step": 100,
"train_speed(iter/s)": 0.077075
},
{
"epoch": 0.36036036036036034,
"eval_acc": 0.6091648189209165,
"eval_loss": 1.6449466943740845,
"eval_runtime": 134.5726,
"eval_samples_per_second": 1.122,
"eval_steps_per_second": 0.565,
"step": 100
},
{
"acc": 0.61147785,
"epoch": 0.3783783783783784,
"grad_norm": 0.69921875,
"learning_rate": 0.00013427862824643083,
"loss": 1.60589867,
"memory(GiB)": 21.03,
"step": 105,
"train_speed(iter/s)": 0.070426
},
{
"acc": 0.6038115,
"epoch": 0.3963963963963964,
"grad_norm": 0.88671875,
"learning_rate": 0.00013412423374184996,
"loss": 1.69055023,
"memory(GiB)": 19.44,
"step": 110,
"train_speed(iter/s)": 0.07105
},
{
"acc": 0.62303677,
"epoch": 0.4144144144144144,
"grad_norm": 0.84375,
"learning_rate": 0.00013395498561304334,
"loss": 1.5716897,
"memory(GiB)": 19.27,
"step": 115,
"train_speed(iter/s)": 0.071618
},
{
"acc": 0.6214046,
"epoch": 0.43243243243243246,
"grad_norm": 0.640625,
"learning_rate": 0.00013377092159324956,
"loss": 1.57531881,
"memory(GiB)": 19.36,
"step": 120,
"train_speed(iter/s)": 0.07209
},
{
"acc": 0.58676672,
"epoch": 0.45045045045045046,
"grad_norm": 0.68359375,
"learning_rate": 0.00013357208271885473,
"loss": 1.74933128,
"memory(GiB)": 19.32,
"step": 125,
"train_speed(iter/s)": 0.072581
},
{
"acc": 0.59380612,
"epoch": 0.46846846846846846,
"grad_norm": 0.7890625,
"learning_rate": 0.00013335851332024374,
"loss": 1.69583378,
"memory(GiB)": 20.18,
"step": 130,
"train_speed(iter/s)": 0.073016
},
{
"acc": 0.62007999,
"epoch": 0.4864864864864865,
"grad_norm": 0.73828125,
"learning_rate": 0.0001331302610119168,
"loss": 1.60020466,
"memory(GiB)": 19.52,
"step": 135,
"train_speed(iter/s)": 0.073417
},
{
"acc": 0.6116991,
"epoch": 0.5045045045045045,
"grad_norm": 1.1015625,
"learning_rate": 0.00013288737668187408,
"loss": 1.62470894,
"memory(GiB)": 19.47,
"step": 140,
"train_speed(iter/s)": 0.073817
},
{
"acc": 0.60051751,
"epoch": 0.5225225225225225,
"grad_norm": 0.87109375,
"learning_rate": 0.00013262991448027034,
"loss": 1.6651041,
"memory(GiB)": 19.42,
"step": 145,
"train_speed(iter/s)": 0.074194
},
{
"acc": 0.60736594,
"epoch": 0.5405405405405406,
"grad_norm": 0.76953125,
"learning_rate": 0.00013235793180734238,
"loss": 1.64281559,
"memory(GiB)": 19.53,
"step": 150,
"train_speed(iter/s)": 0.074547
},
{
"epoch": 0.5405405405405406,
"eval_acc": 0.6190317812269032,
"eval_loss": 1.5917434692382812,
"eval_runtime": 135.0141,
"eval_samples_per_second": 1.118,
"eval_steps_per_second": 0.563,
"step": 150
},
{
"acc": 0.61663914,
"epoch": 0.5585585585585585,
"grad_norm": 1.0625,
"learning_rate": 0.00013207148930061195,
"loss": 1.60914173,
"memory(GiB)": 23.05,
"step": 155,
"train_speed(iter/s)": 0.070306
},
{
"acc": 0.60967774,
"epoch": 0.5765765765765766,
"grad_norm": 0.76953125,
"learning_rate": 0.00013177065082136668,
"loss": 1.59582939,
"memory(GiB)": 19.47,
"step": 160,
"train_speed(iter/s)": 0.070712
},
{
"acc": 0.63630972,
"epoch": 0.5945945945945946,
"grad_norm": 0.70703125,
"learning_rate": 0.00013145548344042262,
"loss": 1.50356016,
"memory(GiB)": 19.62,
"step": 165,
"train_speed(iter/s)": 0.071104
},
{
"acc": 0.60439692,
"epoch": 0.6126126126126126,
"grad_norm": 0.73046875,
"learning_rate": 0.00013112605742317095,
"loss": 1.67050171,
"memory(GiB)": 19.41,
"step": 170,
"train_speed(iter/s)": 0.071478
},
{
"acc": 0.62380457,
"epoch": 0.6306306306306306,
"grad_norm": 0.76171875,
"learning_rate": 0.0001307824462139125,
"loss": 1.53042831,
"memory(GiB)": 19.5,
"step": 175,
"train_speed(iter/s)": 0.071843
},
{
"acc": 0.61549187,
"epoch": 0.6486486486486487,
"grad_norm": 0.7578125,
"learning_rate": 0.00013042472641948386,
"loss": 1.59476538,
"memory(GiB)": 19.53,
"step": 180,
"train_speed(iter/s)": 0.072168
},
{
"acc": 0.64418182,
"epoch": 0.6666666666666666,
"grad_norm": 1.1796875,
"learning_rate": 0.0001300529777921779,
"loss": 1.47999802,
"memory(GiB)": 19.32,
"step": 185,
"train_speed(iter/s)": 0.072501
},
{
"acc": 0.62201657,
"epoch": 0.6846846846846847,
"grad_norm": 0.6484375,
"learning_rate": 0.00012966728321196346,
"loss": 1.5685544,
"memory(GiB)": 19.47,
"step": 190,
"train_speed(iter/s)": 0.072821
},
{
"acc": 0.61418505,
"epoch": 0.7027027027027027,
"grad_norm": 0.8984375,
"learning_rate": 0.00012926772866800757,
"loss": 1.6284462,
"memory(GiB)": 19.45,
"step": 195,
"train_speed(iter/s)": 0.073127
},
{
"acc": 0.62820964,
"epoch": 0.7207207207207207,
"grad_norm": 0.8515625,
"learning_rate": 0.00012885440323950434,
"loss": 1.54364405,
"memory(GiB)": 19.53,
"step": 200,
"train_speed(iter/s)": 0.073413
},
{
"epoch": 0.7207207207207207,
"eval_acc": 0.6269770879526977,
"eval_loss": 1.5466336011886597,
"eval_runtime": 134.7868,
"eval_samples_per_second": 1.12,
"eval_steps_per_second": 0.564,
"step": 200
},
{
"acc": 0.6605804,
"epoch": 0.7387387387387387,
"grad_norm": 0.7578125,
"learning_rate": 0.00012842739907581525,
"loss": 1.42957153,
"memory(GiB)": 23.0,
"step": 205,
"train_speed(iter/s)": 0.070232
},
{
"acc": 0.61267309,
"epoch": 0.7567567567567568,
"grad_norm": 0.90234375,
"learning_rate": 0.00012798681137592477,
"loss": 1.62853241,
"memory(GiB)": 17.96,
"step": 210,
"train_speed(iter/s)": 0.070571
},
{
"acc": 0.63069816,
"epoch": 0.7747747747747747,
"grad_norm": 0.89453125,
"learning_rate": 0.00012753273836721597,
"loss": 1.56295233,
"memory(GiB)": 19.4,
"step": 215,
"train_speed(iter/s)": 0.070892
},
{
"acc": 0.60362072,
"epoch": 0.7927927927927928,
"grad_norm": 1.0703125,
"learning_rate": 0.00012706528128357127,
"loss": 1.63038826,
"memory(GiB)": 19.37,
"step": 220,
"train_speed(iter/s)": 0.071181
},
{
"acc": 0.62272639,
"epoch": 0.8108108108108109,
"grad_norm": 0.8828125,
"learning_rate": 0.00012658454434280253,
"loss": 1.5756237,
"memory(GiB)": 19.62,
"step": 225,
"train_speed(iter/s)": 0.071466
},
{
"acc": 0.59926658,
"epoch": 0.8288288288288288,
"grad_norm": 0.75390625,
"learning_rate": 0.00012609063472341633,
"loss": 1.60503426,
"memory(GiB)": 19.63,
"step": 230,
"train_speed(iter/s)": 0.071751
},
{
"acc": 0.60133944,
"epoch": 0.8468468468468469,
"grad_norm": 1.3515625,
"learning_rate": 0.0001255836625407187,
"loss": 1.64450779,
"memory(GiB)": 19.31,
"step": 235,
"train_speed(iter/s)": 0.072034
},
{
"acc": 0.64020758,
"epoch": 0.8648648648648649,
"grad_norm": 0.9375,
"learning_rate": 0.00012506374082226534,
"loss": 1.47053967,
"memory(GiB)": 18.85,
"step": 240,
"train_speed(iter/s)": 0.072286
},
{
"acc": 0.62713485,
"epoch": 0.8828828828828829,
"grad_norm": 0.82421875,
"learning_rate": 0.00012453098548266276,
"loss": 1.51464148,
"memory(GiB)": 19.35,
"step": 245,
"train_speed(iter/s)": 0.07254
},
{
"acc": 0.6202302,
"epoch": 0.9009009009009009,
"grad_norm": 0.625,
"learning_rate": 0.0001239855152977253,
"loss": 1.54778471,
"memory(GiB)": 19.53,
"step": 250,
"train_speed(iter/s)": 0.072758
},
{
"epoch": 0.9009009009009009,
"eval_acc": 0.6308573540280857,
"eval_loss": 1.510523796081543,
"eval_runtime": 134.5445,
"eval_samples_per_second": 1.122,
"eval_steps_per_second": 0.565,
"step": 250
},
{
"acc": 0.63671951,
"epoch": 0.918918918918919,
"grad_norm": 1.7109375,
"learning_rate": 0.00012342745187799459,
"loss": 1.48321924,
"memory(GiB)": 19.53,
"step": 255,
"train_speed(iter/s)": 0.070273
},
{
"acc": 0.63577223,
"epoch": 0.9369369369369369,
"grad_norm": 0.7890625,
"learning_rate": 0.000122856919641627,
"loss": 1.50699987,
"memory(GiB)": 19.94,
"step": 260,
"train_speed(iter/s)": 0.070553
},
{
"acc": 0.64953299,
"epoch": 0.954954954954955,
"grad_norm": 0.85546875,
"learning_rate": 0.000122274045786655,
"loss": 1.46005678,
"memory(GiB)": 20.1,
"step": 265,
"train_speed(iter/s)": 0.070802
},
{
"acc": 0.62153759,
"epoch": 0.972972972972973,
"grad_norm": 1.0625,
"learning_rate": 0.00012167896026262893,
"loss": 1.55834417,
"memory(GiB)": 19.86,
"step": 270,
"train_speed(iter/s)": 0.071052
},
{
"acc": 0.64055209,
"epoch": 0.990990990990991,
"grad_norm": 1.125,
"learning_rate": 0.00012107179574164504,
"loss": 1.54932261,
"memory(GiB)": 20.06,
"step": 275,
"train_speed(iter/s)": 0.071274
},
{
"acc": 0.62708969,
"epoch": 1.009009009009009,
"grad_norm": 0.671875,
"learning_rate": 0.00012045268758876699,
"loss": 1.49731979,
"memory(GiB)": 19.82,
"step": 280,
"train_speed(iter/s)": 0.07152
},
{
"acc": 0.6689836,
"epoch": 1.027027027027027,
"grad_norm": 0.859375,
"learning_rate": 0.00011982177383184648,
"loss": 1.2817215,
"memory(GiB)": 19.85,
"step": 285,
"train_speed(iter/s)": 0.07175
},
{
"acc": 0.67519293,
"epoch": 1.045045045045045,
"grad_norm": 1.046875,
"learning_rate": 0.00011917919513075066,
"loss": 1.28632126,
"memory(GiB)": 19.98,
"step": 290,
"train_speed(iter/s)": 0.071951
},
{
"acc": 0.67276659,
"epoch": 1.063063063063063,
"grad_norm": 0.8984375,
"learning_rate": 0.00011852509474600237,
"loss": 1.27065611,
"memory(GiB)": 20.03,
"step": 295,
"train_speed(iter/s)": 0.072155
},
{
"acc": 0.64641519,
"epoch": 1.0810810810810811,
"grad_norm": 0.98046875,
"learning_rate": 0.00011785961850684083,
"loss": 1.38271847,
"memory(GiB)": 19.09,
"step": 300,
"train_speed(iter/s)": 0.072371
},
{
"epoch": 1.0810810810810811,
"eval_acc": 0.6305617147080562,
"eval_loss": 1.523685097694397,
"eval_runtime": 134.8234,
"eval_samples_per_second": 1.12,
"eval_steps_per_second": 0.564,
"step": 300
},
{
"acc": 0.67837138,
"epoch": 1.0990990990990992,
"grad_norm": 0.953125,
"learning_rate": 0.00011718291477870959,
"loss": 1.29290819,
"memory(GiB)": 22.8,
"step": 305,
"train_speed(iter/s)": 0.070277
},
{
"acc": 0.67195911,
"epoch": 1.117117117117117,
"grad_norm": 1.796875,
"learning_rate": 0.00011649513443017889,
"loss": 1.24073734,
"memory(GiB)": 19.39,
"step": 310,
"train_speed(iter/s)": 0.070516
},
{
"acc": 0.69478951,
"epoch": 1.135135135135135,
"grad_norm": 1.203125,
"learning_rate": 0.00011579643079931018,
"loss": 1.20378675,
"memory(GiB)": 19.38,
"step": 315,
"train_speed(iter/s)": 0.070713
},
{
"acc": 0.68726826,
"epoch": 1.1531531531531531,
"grad_norm": 0.98828125,
"learning_rate": 0.00011508695965946992,
"loss": 1.23284683,
"memory(GiB)": 19.98,
"step": 320,
"train_speed(iter/s)": 0.070919
},
{
"acc": 0.65419765,
"epoch": 1.1711711711711712,
"grad_norm": 0.93359375,
"learning_rate": 0.00011436687918460052,
"loss": 1.37520065,
"memory(GiB)": 20.02,
"step": 325,
"train_speed(iter/s)": 0.071117
},
{
"acc": 0.66610641,
"epoch": 1.1891891891891893,
"grad_norm": 0.8671875,
"learning_rate": 0.000113636349913956,
"loss": 1.30743008,
"memory(GiB)": 19.35,
"step": 330,
"train_speed(iter/s)": 0.071322
},
{
"acc": 0.67390976,
"epoch": 1.2072072072072073,
"grad_norm": 1.6640625,
"learning_rate": 0.00011289553471631045,
"loss": 1.28322783,
"memory(GiB)": 19.49,
"step": 335,
"train_speed(iter/s)": 0.071518
},
{
"acc": 0.68137512,
"epoch": 1.2252252252252251,
"grad_norm": 0.6953125,
"learning_rate": 0.00011214459875364693,
"loss": 1.23027716,
"memory(GiB)": 19.38,
"step": 340,
"train_speed(iter/s)": 0.071692
},
{
"acc": 0.67859125,
"epoch": 1.2432432432432432,
"grad_norm": 0.78515625,
"learning_rate": 0.00011138370944433531,
"loss": 1.22896252,
"memory(GiB)": 20.06,
"step": 345,
"train_speed(iter/s)": 0.071876
},
{
"acc": 0.66445112,
"epoch": 1.2612612612612613,
"grad_norm": 0.90234375,
"learning_rate": 0.00011061303642580694,
"loss": 1.30674038,
"memory(GiB)": 19.49,
"step": 350,
"train_speed(iter/s)": 0.072045
},
{
"epoch": 1.2612612612612613,
"eval_acc": 0.6356245380635624,
"eval_loss": 1.5072119235992432,
"eval_runtime": 134.5232,
"eval_samples_per_second": 1.122,
"eval_steps_per_second": 0.565,
"step": 350
},
{
"acc": 0.67729836,
"epoch": 1.2792792792792793,
"grad_norm": 0.90625,
"learning_rate": 0.00010983275151673467,
"loss": 1.24173574,
"memory(GiB)": 18.93,
"step": 355,
"train_speed(iter/s)": 0.07029
},
{
"acc": 0.7040791,
"epoch": 1.2972972972972974,
"grad_norm": 0.84765625,
"learning_rate": 0.00010904302867872639,
"loss": 1.17582674,
"memory(GiB)": 19.29,
"step": 360,
"train_speed(iter/s)": 0.070479
},
{
"acc": 0.66356058,
"epoch": 1.3153153153153152,
"grad_norm": 0.82421875,
"learning_rate": 0.00010824404397754104,
"loss": 1.26798725,
"memory(GiB)": 19.36,
"step": 365,
"train_speed(iter/s)": 0.070661
},
{
"acc": 0.69379635,
"epoch": 1.3333333333333333,
"grad_norm": 0.98828125,
"learning_rate": 0.0001074359755438354,
"loss": 1.24331112,
"memory(GiB)": 20.16,
"step": 370,
"train_speed(iter/s)": 0.070843
},
{
"acc": 0.68220735,
"epoch": 1.3513513513513513,
"grad_norm": 0.94140625,
"learning_rate": 0.00010661900353345051,
"loss": 1.20891714,
"memory(GiB)": 19.61,
"step": 375,
"train_speed(iter/s)": 0.071015
},
{
"acc": 0.67620883,
"epoch": 1.3693693693693694,
"grad_norm": 1.0625,
"learning_rate": 0.0001057933100872466,
"loss": 1.23957863,
"memory(GiB)": 20.17,
"step": 380,
"train_speed(iter/s)": 0.071181
},
{
"acc": 0.63655629,
"epoch": 1.3873873873873874,
"grad_norm": 0.78515625,
"learning_rate": 0.00010495907929049546,
"loss": 1.44390507,
"memory(GiB)": 19.25,
"step": 385,
"train_speed(iter/s)": 0.071356
},
{
"acc": 0.67883902,
"epoch": 1.4054054054054055,
"grad_norm": 0.8828125,
"learning_rate": 0.00010411649713183925,
"loss": 1.29691544,
"memory(GiB)": 18.78,
"step": 390,
"train_speed(iter/s)": 0.071515
},
{
"acc": 0.67202511,
"epoch": 1.4234234234234235,
"grad_norm": 0.953125,
"learning_rate": 0.00010326575146182521,
"loss": 1.31318274,
"memory(GiB)": 19.88,
"step": 395,
"train_speed(iter/s)": 0.071677
},
{
"acc": 0.69274058,
"epoch": 1.4414414414414414,
"grad_norm": 0.82421875,
"learning_rate": 0.00010240703195102489,
"loss": 1.15976305,
"memory(GiB)": 19.46,
"step": 400,
"train_speed(iter/s)": 0.071832
},
{
"epoch": 1.4414414414414414,
"eval_acc": 0.6368440502586844,
"eval_loss": 1.4986343383789062,
"eval_runtime": 134.3425,
"eval_samples_per_second": 1.124,
"eval_steps_per_second": 0.566,
"step": 400
},
{
"acc": 0.71039405,
"epoch": 1.4594594594594594,
"grad_norm": 0.77734375,
"learning_rate": 0.0001015405300477479,
"loss": 1.12253609,
"memory(GiB)": 19.92,
"step": 405,
"train_speed(iter/s)": 0.070298
},
{
"acc": 0.71356583,
"epoch": 1.4774774774774775,
"grad_norm": 0.84375,
"learning_rate": 0.0001006664389353592,
"loss": 1.13753939,
"memory(GiB)": 19.31,
"step": 410,
"train_speed(iter/s)": 0.070457
},
{
"acc": 0.675458,
"epoch": 1.4954954954954955,
"grad_norm": 1.1328125,
"learning_rate": 9.978495348920958e-05,
"loss": 1.29233532,
"memory(GiB)": 19.06,
"step": 415,
"train_speed(iter/s)": 0.070616
},
{
"acc": 0.67761598,
"epoch": 1.5135135135135136,
"grad_norm": 0.6875,
"learning_rate": 9.889627023318897e-05,
"loss": 1.22440186,
"memory(GiB)": 19.16,
"step": 420,
"train_speed(iter/s)": 0.070773
},
{
"acc": 0.67492404,
"epoch": 1.5315315315315314,
"grad_norm": 0.81640625,
"learning_rate": 9.800058729591212e-05,
"loss": 1.22408361,
"memory(GiB)": 19.97,
"step": 425,
"train_speed(iter/s)": 0.070935
},
{
"acc": 0.68050842,
"epoch": 1.5495495495495497,
"grad_norm": 0.84765625,
"learning_rate": 9.70981043665466e-05,
"loss": 1.2078824,
"memory(GiB)": 19.92,
"step": 430,
"train_speed(iter/s)": 0.07109
},
{
"acc": 0.6750885,
"epoch": 1.5675675675675675,
"grad_norm": 0.66796875,
"learning_rate": 9.618902265029284e-05,
"loss": 1.28742075,
"memory(GiB)": 19.27,
"step": 435,
"train_speed(iter/s)": 0.071229
},
{
"acc": 0.64411507,
"epoch": 1.5855855855855856,
"grad_norm": 0.95703125,
"learning_rate": 9.527354482352616e-05,
"loss": 1.37240067,
"memory(GiB)": 20.21,
"step": 440,
"train_speed(iter/s)": 0.071374
},
{
"acc": 0.67574663,
"epoch": 1.6036036036036037,
"grad_norm": 0.83984375,
"learning_rate": 9.435187498861085e-05,
"loss": 1.27780771,
"memory(GiB)": 19.95,
"step": 445,
"train_speed(iter/s)": 0.071519
},
{
"acc": 0.67897987,
"epoch": 1.6216216216216215,
"grad_norm": 1.2265625,
"learning_rate": 9.342421862839632e-05,
"loss": 1.26616125,
"memory(GiB)": 19.32,
"step": 450,
"train_speed(iter/s)": 0.071661
},
{
"epoch": 1.6216216216216215,
"eval_acc": 0.6424611973392461,
"eval_loss": 1.4772522449493408,
"eval_runtime": 134.5995,
"eval_samples_per_second": 1.122,
"eval_steps_per_second": 0.565,
"step": 450
},
{
"acc": 0.66755495,
"epoch": 1.6396396396396398,
"grad_norm": 1.0390625,
"learning_rate": 9.249078256040541e-05,
"loss": 1.30118093,
"memory(GiB)": 22.82,
"step": 455,
"train_speed(iter/s)": 0.070312
},
{
"acc": 0.66560607,
"epoch": 1.6576576576576576,
"grad_norm": 1.0546875,
"learning_rate": 9.155177489072527e-05,
"loss": 1.31042576,
"memory(GiB)": 19.56,
"step": 460,
"train_speed(iter/s)": 0.070454
},
{
"acc": 0.67957892,
"epoch": 1.6756756756756757,
"grad_norm": 1.3828125,
"learning_rate": 9.060740496761082e-05,
"loss": 1.31165123,
"memory(GiB)": 19.38,
"step": 465,
"train_speed(iter/s)": 0.070592
},
{
"acc": 0.6744031,
"epoch": 1.6936936936936937,
"grad_norm": 1.4140625,
"learning_rate": 8.965788333481144e-05,
"loss": 1.26758223,
"memory(GiB)": 19.42,
"step": 470,
"train_speed(iter/s)": 0.070726
},
{
"acc": 0.66551232,
"epoch": 1.7117117117117115,
"grad_norm": 0.98046875,
"learning_rate": 8.870342168463085e-05,
"loss": 1.27216129,
"memory(GiB)": 19.27,
"step": 475,
"train_speed(iter/s)": 0.070864
},
{
"acc": 0.65833273,
"epoch": 1.7297297297297298,
"grad_norm": 0.9140625,
"learning_rate": 8.77442328107313e-05,
"loss": 1.32684155,
"memory(GiB)": 19.48,
"step": 480,
"train_speed(iter/s)": 0.070997
},
{
"acc": 0.68646383,
"epoch": 1.7477477477477477,
"grad_norm": 1.3671875,
"learning_rate": 8.678053056069184e-05,
"loss": 1.2200016,
"memory(GiB)": 19.24,
"step": 485,
"train_speed(iter/s)": 0.071136
},
{
"acc": 0.69040904,
"epoch": 1.7657657657657657,
"grad_norm": 1.6171875,
"learning_rate": 8.581252978833194e-05,
"loss": 1.18706884,
"memory(GiB)": 19.53,
"step": 490,
"train_speed(iter/s)": 0.07127
},
{
"acc": 0.66571455,
"epoch": 1.7837837837837838,
"grad_norm": 0.8515625,
"learning_rate": 8.484044630581057e-05,
"loss": 1.29456005,
"memory(GiB)": 20.09,
"step": 495,
"train_speed(iter/s)": 0.071401
},
{
"acc": 0.67682033,
"epoch": 1.8018018018018018,
"grad_norm": 1.0,
"learning_rate": 8.386449683551164e-05,
"loss": 1.20547714,
"memory(GiB)": 19.95,
"step": 500,
"train_speed(iter/s)": 0.071533
},
{
"epoch": 1.8018018018018018,
"eval_acc": 0.6413155949741316,
"eval_loss": 1.479081630706787,
"eval_runtime": 134.2299,
"eval_samples_per_second": 1.125,
"eval_steps_per_second": 0.566,
"step": 500
},
{
"acc": 0.67326751,
"epoch": 1.8198198198198199,
"grad_norm": 1.0546875,
"learning_rate": 8.288489896172669e-05,
"loss": 1.25247726,
"memory(GiB)": 20.29,
"step": 505,
"train_speed(iter/s)": 0.070304
},
{
"acc": 0.66375732,
"epoch": 1.8378378378378377,
"grad_norm": 0.9296875,
"learning_rate": 8.190187108214514e-05,
"loss": 1.28065901,
"memory(GiB)": 20.04,
"step": 510,
"train_speed(iter/s)": 0.070438
},
{
"acc": 0.69006267,
"epoch": 1.855855855855856,
"grad_norm": 1.0234375,
"learning_rate": 8.091563235916343e-05,
"loss": 1.13905525,
"memory(GiB)": 20.03,
"step": 515,
"train_speed(iter/s)": 0.070569
},
{
"acc": 0.69745221,
"epoch": 1.8738738738738738,
"grad_norm": 0.96484375,
"learning_rate": 7.992640267102351e-05,
"loss": 1.14712362,
"memory(GiB)": 18.5,
"step": 520,
"train_speed(iter/s)": 0.070709
},
{
"acc": 0.6707756,
"epoch": 1.8918918918918919,
"grad_norm": 1.328125,
"learning_rate": 7.893440256279186e-05,
"loss": 1.30717278,
"memory(GiB)": 20.66,
"step": 525,
"train_speed(iter/s)": 0.07083
},
{
"acc": 0.66872559,
"epoch": 1.90990990990991,
"grad_norm": 0.9765625,
"learning_rate": 7.793985319718982e-05,
"loss": 1.28408003,
"memory(GiB)": 19.48,
"step": 530,
"train_speed(iter/s)": 0.070948
},
{
"acc": 0.68111048,
"epoch": 1.9279279279279278,
"grad_norm": 0.76171875,
"learning_rate": 7.694297630528612e-05,
"loss": 1.21391411,
"memory(GiB)": 19.88,
"step": 535,
"train_speed(iter/s)": 0.071071
},
{
"acc": 0.65094652,
"epoch": 1.945945945945946,
"grad_norm": 0.83203125,
"learning_rate": 7.594399413706277e-05,
"loss": 1.34138126,
"memory(GiB)": 19.9,
"step": 540,
"train_speed(iter/s)": 0.071193
},
{
"acc": 0.67896776,
"epoch": 1.9639639639639639,
"grad_norm": 0.796875,
"learning_rate": 7.494312941186529e-05,
"loss": 1.22575331,
"memory(GiB)": 19.43,
"step": 545,
"train_speed(iter/s)": 0.071302
},
{
"acc": 0.6839644,
"epoch": 1.981981981981982,
"grad_norm": 0.78515625,
"learning_rate": 7.394060526874825e-05,
"loss": 1.25017443,
"memory(GiB)": 19.25,
"step": 550,
"train_speed(iter/s)": 0.07142
},
{
"epoch": 1.981981981981982,
"eval_acc": 0.645269770879527,
"eval_loss": 1.4606801271438599,
"eval_runtime": 134.7756,
"eval_samples_per_second": 1.12,
"eval_steps_per_second": 0.564,
"step": 550
},
{
"acc": 0.68771811,
"epoch": 2.0,
"grad_norm": 0.81640625,
"learning_rate": 7.293664521672729e-05,
"loss": 1.22415581,
"memory(GiB)": 22.67,
"step": 555,
"train_speed(iter/s)": 0.070304
},
{
"acc": 0.741537,
"epoch": 2.018018018018018,
"grad_norm": 0.6171875,
"learning_rate": 7.193147308494851e-05,
"loss": 0.95370378,
"memory(GiB)": 19.64,
"step": 560,
"train_speed(iter/s)": 0.070425
},
{
"acc": 0.75044699,
"epoch": 2.036036036036036,
"grad_norm": 1.09375,
"learning_rate": 7.09253129727867e-05,
"loss": 0.95568914,
"memory(GiB)": 19.4,
"step": 565,
"train_speed(iter/s)": 0.070541
},
{
"acc": 0.75126195,
"epoch": 2.054054054054054,
"grad_norm": 1.3671875,
"learning_rate": 6.991838919988322e-05,
"loss": 0.92719631,
"memory(GiB)": 19.54,
"step": 570,
"train_speed(iter/s)": 0.070658
},
{
"acc": 0.74883032,
"epoch": 2.0720720720720722,
"grad_norm": 1.0078125,
"learning_rate": 6.891092625613469e-05,
"loss": 0.92080975,
"memory(GiB)": 20.17,
"step": 575,
"train_speed(iter/s)": 0.07077
},
{
"acc": 0.76222944,
"epoch": 2.09009009009009,
"grad_norm": 0.99609375,
"learning_rate": 6.790314875164393e-05,
"loss": 0.88407106,
"memory(GiB)": 19.57,
"step": 580,
"train_speed(iter/s)": 0.070882
},
{
"acc": 0.76224823,
"epoch": 2.108108108108108,
"grad_norm": 1.0859375,
"learning_rate": 6.689528136664377e-05,
"loss": 0.85150976,
"memory(GiB)": 19.54,
"step": 585,
"train_speed(iter/s)": 0.070995
},
{
"acc": 0.73958569,
"epoch": 2.126126126126126,
"grad_norm": 1.3828125,
"learning_rate": 6.588754880140573e-05,
"loss": 0.92128286,
"memory(GiB)": 19.58,
"step": 590,
"train_speed(iter/s)": 0.071101
},
{
"acc": 0.74549003,
"epoch": 2.144144144144144,
"grad_norm": 1.359375,
"learning_rate": 6.488017572614363e-05,
"loss": 0.90851021,
"memory(GiB)": 18.59,
"step": 595,
"train_speed(iter/s)": 0.071211
},
{
"acc": 0.73912826,
"epoch": 2.1621621621621623,
"grad_norm": 1.3125,
"learning_rate": 6.387338673092443e-05,
"loss": 0.92900734,
"memory(GiB)": 19.54,
"step": 600,
"train_speed(iter/s)": 0.071321
},
{
"epoch": 2.1621621621621623,
"eval_acc": 0.6320768662232077,
"eval_loss": 1.5818341970443726,
"eval_runtime": 134.4691,
"eval_samples_per_second": 1.123,
"eval_steps_per_second": 0.565,
"step": 600
},
{
"acc": 0.75979438,
"epoch": 2.18018018018018,
"grad_norm": 1.09375,
"learning_rate": 6.286740627559656e-05,
"loss": 0.89129753,
"memory(GiB)": 22.37,
"step": 605,
"train_speed(iter/s)": 0.070301
},
{
"acc": 0.72820721,
"epoch": 2.1981981981981984,
"grad_norm": 2.15625,
"learning_rate": 6.186245863974757e-05,
"loss": 0.96495447,
"memory(GiB)": 19.6,
"step": 610,
"train_speed(iter/s)": 0.070413
},
{
"acc": 0.75764585,
"epoch": 2.2162162162162162,
"grad_norm": 1.0078125,
"learning_rate": 6.0858767872701715e-05,
"loss": 0.89218092,
"memory(GiB)": 20.15,
"step": 615,
"train_speed(iter/s)": 0.070515
},
{
"acc": 0.75772595,
"epoch": 2.234234234234234,
"grad_norm": 1.6328125,
"learning_rate": 5.985655774356901e-05,
"loss": 0.89191771,
"memory(GiB)": 19.46,
"step": 620,
"train_speed(iter/s)": 0.070627
},
{
"acc": 0.7377079,
"epoch": 2.2522522522522523,
"grad_norm": 1.1875,
"learning_rate": 5.8856051691356884e-05,
"loss": 0.94241228,
"memory(GiB)": 19.35,
"step": 625,
"train_speed(iter/s)": 0.070733
},
{
"acc": 0.77948771,
"epoch": 2.27027027027027,
"grad_norm": 1.2890625,
"learning_rate": 5.785747277515506e-05,
"loss": 0.79317036,
"memory(GiB)": 20.48,
"step": 630,
"train_speed(iter/s)": 0.070844
},
{
"acc": 0.76766949,
"epoch": 2.2882882882882885,
"grad_norm": 0.97265625,
"learning_rate": 5.686104362440552e-05,
"loss": 0.82855272,
"memory(GiB)": 20.12,
"step": 635,
"train_speed(iter/s)": 0.070945
},
{
"acc": 0.74998231,
"epoch": 2.3063063063063063,
"grad_norm": 2.9375,
"learning_rate": 5.586698638926811e-05,
"loss": 0.93049393,
"memory(GiB)": 20.06,
"step": 640,
"train_speed(iter/s)": 0.071044
},
{
"acc": 0.75094385,
"epoch": 2.3243243243243246,
"grad_norm": 1.1875,
"learning_rate": 5.487552269109287e-05,
"loss": 0.86875353,
"memory(GiB)": 19.33,
"step": 645,
"train_speed(iter/s)": 0.071146
},
{
"acc": 0.74836354,
"epoch": 2.3423423423423424,
"grad_norm": 1.1328125,
"learning_rate": 5.388687357301051e-05,
"loss": 0.88861446,
"memory(GiB)": 20.11,
"step": 650,
"train_speed(iter/s)": 0.071249
},
{
"epoch": 2.3423423423423424,
"eval_acc": 0.630709534368071,
"eval_loss": 1.5767972469329834,
"eval_runtime": 134.3063,
"eval_samples_per_second": 1.124,
"eval_steps_per_second": 0.566,
"step": 650
},
{
"acc": 0.76697993,
"epoch": 2.3603603603603602,
"grad_norm": 1.2734375,
"learning_rate": 5.290125945065162e-05,
"loss": 0.85701361,
"memory(GiB)": 22.96,
"step": 655,
"train_speed(iter/s)": 0.070324
},
{
"acc": 0.76252317,
"epoch": 2.3783783783783785,
"grad_norm": 1.0390625,
"learning_rate": 5.191890006300573e-05,
"loss": 0.85787058,
"memory(GiB)": 20.13,
"step": 660,
"train_speed(iter/s)": 0.070422
},
{
"acc": 0.7651772,
"epoch": 2.3963963963963963,
"grad_norm": 1.1875,
"learning_rate": 5.094001442343155e-05,
"loss": 0.8521904,
"memory(GiB)": 19.86,
"step": 665,
"train_speed(iter/s)": 0.070523
},
{
"acc": 0.73847542,
"epoch": 2.4144144144144146,
"grad_norm": 1.2734375,
"learning_rate": 4.996482077082849e-05,
"loss": 0.95858736,
"memory(GiB)": 19.29,
"step": 670,
"train_speed(iter/s)": 0.070628
},
{
"acc": 0.74675932,
"epoch": 2.4324324324324325,
"grad_norm": 1.2734375,
"learning_rate": 4.899353652098139e-05,
"loss": 0.86487961,
"memory(GiB)": 18.64,
"step": 675,
"train_speed(iter/s)": 0.070727
},
{
"acc": 0.73309464,
"epoch": 2.4504504504504503,
"grad_norm": 1.8671875,
"learning_rate": 4.802637821808819e-05,
"loss": 0.93775883,
"memory(GiB)": 19.78,
"step": 680,
"train_speed(iter/s)": 0.070825
},
{
"acc": 0.76575212,
"epoch": 2.4684684684684686,
"grad_norm": 1.03125,
"learning_rate": 4.706356148648246e-05,
"loss": 0.8259285,
"memory(GiB)": 19.9,
"step": 685,
"train_speed(iter/s)": 0.07092
},
{
"acc": 0.76865396,
"epoch": 2.4864864864864864,
"grad_norm": 1.3125,
"learning_rate": 4.6105300982560625e-05,
"loss": 0.84868517,
"memory(GiB)": 19.19,
"step": 690,
"train_speed(iter/s)": 0.071014
},
{
"acc": 0.75694928,
"epoch": 2.5045045045045047,
"grad_norm": 1.03125,
"learning_rate": 4.515181034692515e-05,
"loss": 0.87043924,
"memory(GiB)": 19.95,
"step": 695,
"train_speed(iter/s)": 0.071105
},
{
"acc": 0.75771561,
"epoch": 2.5225225225225225,
"grad_norm": 1.3515625,
"learning_rate": 4.420330215675415e-05,
"loss": 0.86245804,
"memory(GiB)": 19.18,
"step": 700,
"train_speed(iter/s)": 0.071194
},
{
"epoch": 2.5225225225225225,
"eval_acc": 0.6335181079083518,
"eval_loss": 1.5894646644592285,
"eval_runtime": 134.225,
"eval_samples_per_second": 1.125,
"eval_steps_per_second": 0.566,
"step": 700
},
{
"acc": 0.76191721,
"epoch": 2.5405405405405403,
"grad_norm": 1.71875,
"learning_rate": 4.325998787840818e-05,
"loss": 0.85848246,
"memory(GiB)": 19.14,
"step": 705,
"train_speed(iter/s)": 0.070324
},
{
"acc": 0.76571012,
"epoch": 2.5585585585585586,
"grad_norm": 1.15625,
"learning_rate": 4.2322077820284477e-05,
"loss": 0.85979414,
"memory(GiB)": 20.01,
"step": 710,
"train_speed(iter/s)": 0.070422
},
{
"acc": 0.73852654,
"epoch": 2.5765765765765765,
"grad_norm": 1.6484375,
"learning_rate": 4.138978108592962e-05,
"loss": 0.90148897,
"memory(GiB)": 19.05,
"step": 715,
"train_speed(iter/s)": 0.070518
},
{
"acc": 0.76960816,
"epoch": 2.5945945945945947,
"grad_norm": 3.71875,
"learning_rate": 4.046330552742053e-05,
"loss": 0.88053255,
"memory(GiB)": 19.25,
"step": 720,
"train_speed(iter/s)": 0.070616
},
{
"acc": 0.77552128,
"epoch": 2.6126126126126126,
"grad_norm": 0.96484375,
"learning_rate": 3.954285769902474e-05,
"loss": 0.83608866,
"memory(GiB)": 19.96,
"step": 725,
"train_speed(iter/s)": 0.070707
},
{
"acc": 0.76034231,
"epoch": 2.6306306306306304,
"grad_norm": 1.078125,
"learning_rate": 3.8628642811149894e-05,
"loss": 0.84258709,
"memory(GiB)": 19.75,
"step": 730,
"train_speed(iter/s)": 0.070796
},
{
"acc": 0.73506665,
"epoch": 2.6486486486486487,
"grad_norm": 2.125,
"learning_rate": 3.772086468459271e-05,
"loss": 0.96418314,
"memory(GiB)": 19.94,
"step": 735,
"train_speed(iter/s)": 0.070887
},
{
"acc": 0.74339218,
"epoch": 2.6666666666666665,
"grad_norm": 1.3359375,
"learning_rate": 3.6819725705098094e-05,
"loss": 0.94632616,
"memory(GiB)": 19.98,
"step": 740,
"train_speed(iter/s)": 0.070978
},
{
"acc": 0.75258017,
"epoch": 2.684684684684685,
"grad_norm": 1.328125,
"learning_rate": 3.592542677823787e-05,
"loss": 0.89630384,
"memory(GiB)": 19.9,
"step": 745,
"train_speed(iter/s)": 0.071065
},
{
"acc": 0.7422905,
"epoch": 2.7027027027027026,
"grad_norm": 1.46875,
"learning_rate": 3.503816728461963e-05,
"loss": 0.92554636,
"memory(GiB)": 19.94,
"step": 750,
"train_speed(iter/s)": 0.071152
},
{
"epoch": 2.7027027027027026,
"eval_acc": 0.6360679970436068,
"eval_loss": 1.577430248260498,
"eval_runtime": 134.0595,
"eval_samples_per_second": 1.126,
"eval_steps_per_second": 0.567,
"step": 750
},
{
"acc": 0.76009235,
"epoch": 2.7207207207207205,
"grad_norm": 1.7265625,
"learning_rate": 3.415814503543563e-05,
"loss": 0.89433851,
"memory(GiB)": 19.38,
"step": 755,
"train_speed(iter/s)": 0.070345
},
{
"acc": 0.75049233,
"epoch": 2.7387387387387387,
"grad_norm": 1.453125,
"learning_rate": 3.3285556228361483e-05,
"loss": 0.90194426,
"memory(GiB)": 19.78,
"step": 760,
"train_speed(iter/s)": 0.070432
},
{
"acc": 0.73652792,
"epoch": 2.756756756756757,
"grad_norm": 1.375,
"learning_rate": 3.2420595403814615e-05,
"loss": 0.94170513,
"memory(GiB)": 19.18,
"step": 765,
"train_speed(iter/s)": 0.070517
},
{
"acc": 0.74097948,
"epoch": 2.774774774774775,
"grad_norm": 1.171875,
"learning_rate": 3.156345540158226e-05,
"loss": 0.92526283,
"memory(GiB)": 19.96,
"step": 770,
"train_speed(iter/s)": 0.070603
},
{
"acc": 0.77357135,
"epoch": 2.7927927927927927,
"grad_norm": 1.21875,
"learning_rate": 3.0714327317828445e-05,
"loss": 0.84344234,
"memory(GiB)": 19.42,
"step": 775,
"train_speed(iter/s)": 0.070681
},
{
"acc": 0.76570077,
"epoch": 2.810810810810811,
"grad_norm": 1.4765625,
"learning_rate": 2.9873400462489982e-05,
"loss": 0.85261898,
"memory(GiB)": 19.91,
"step": 780,
"train_speed(iter/s)": 0.070768
},
{
"acc": 0.73979292,
"epoch": 2.828828828828829,
"grad_norm": 1.375,
"learning_rate": 2.904086231707032e-05,
"loss": 0.94777365,
"memory(GiB)": 19.72,
"step": 785,
"train_speed(iter/s)": 0.07085
},
{
"acc": 0.75035534,
"epoch": 2.846846846846847,
"grad_norm": 1.1484375,
"learning_rate": 2.8216898492841355e-05,
"loss": 0.88380022,
"memory(GiB)": 19.09,
"step": 790,
"train_speed(iter/s)": 0.070936
},
{
"acc": 0.76033754,
"epoch": 2.864864864864865,
"grad_norm": 1.078125,
"learning_rate": 2.7401692689462153e-05,
"loss": 0.84767551,
"memory(GiB)": 20.02,
"step": 795,
"train_speed(iter/s)": 0.071016
},
{
"acc": 0.74806399,
"epoch": 2.8828828828828827,
"grad_norm": 1.53125,
"learning_rate": 2.6595426654023643e-05,
"loss": 0.92544088,
"memory(GiB)": 19.88,
"step": 800,
"train_speed(iter/s)": 0.0711
},
{
"epoch": 2.8828828828828827,
"eval_acc": 0.635920177383592,
"eval_loss": 1.5869847536087036,
"eval_runtime": 134.517,
"eval_samples_per_second": 1.123,
"eval_steps_per_second": 0.565,
"step": 800
}
],
"logging_steps": 5,
"max_steps": 1108,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.3166381763355443e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}