{ "best_metric": 0.16356664896011353, "best_model_checkpoint": "./vit-indian-food/checkpoint-500", "epoch": 2.6178010471204187, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 6.967112064361572, "learning_rate": 0.00019895287958115185, "loss": 2.3423, "step": 5 }, { "epoch": 0.05, "grad_norm": 6.768218517303467, "learning_rate": 0.00019790575916230367, "loss": 1.5082, "step": 10 }, { "epoch": 0.08, "grad_norm": 6.283707141876221, "learning_rate": 0.0001968586387434555, "loss": 1.1555, "step": 15 }, { "epoch": 0.1, "grad_norm": 4.167559623718262, "learning_rate": 0.00019581151832460733, "loss": 0.7045, "step": 20 }, { "epoch": 0.13, "grad_norm": 6.519095420837402, "learning_rate": 0.00019476439790575917, "loss": 0.5942, "step": 25 }, { "epoch": 0.16, "grad_norm": 4.0483479499816895, "learning_rate": 0.000193717277486911, "loss": 0.45, "step": 30 }, { "epoch": 0.18, "grad_norm": 4.359745025634766, "learning_rate": 0.00019267015706806283, "loss": 0.3833, "step": 35 }, { "epoch": 0.21, "grad_norm": 5.201784610748291, "learning_rate": 0.00019162303664921465, "loss": 0.3702, "step": 40 }, { "epoch": 0.24, "grad_norm": 4.259886741638184, "learning_rate": 0.0001905759162303665, "loss": 0.3227, "step": 45 }, { "epoch": 0.26, "grad_norm": 4.89500617980957, "learning_rate": 0.00018952879581151833, "loss": 0.2368, "step": 50 }, { "epoch": 0.26, "eval_accuracy": 0.9002624671916011, "eval_loss": 0.33144867420196533, "eval_precision": 0.9131344269428404, "eval_recall": 0.9002624671916011, "eval_runtime": 17.436, "eval_samples_per_second": 43.703, "eval_steps_per_second": 2.753, "step": 50 }, { "epoch": 0.29, "grad_norm": 4.650336265563965, "learning_rate": 0.00018848167539267018, "loss": 0.4139, "step": 55 }, { "epoch": 0.31, "grad_norm": 4.881960391998291, "learning_rate": 0.00018743455497382202, "loss": 0.2109, "step": 60 }, { "epoch": 0.34, "grad_norm": 5.3574066162109375, "learning_rate": 0.00018638743455497384, "loss": 0.3076, "step": 65 }, { "epoch": 0.37, "grad_norm": 5.008607864379883, "learning_rate": 0.00018534031413612568, "loss": 0.2632, "step": 70 }, { "epoch": 0.39, "grad_norm": 4.561398506164551, "learning_rate": 0.0001842931937172775, "loss": 0.2062, "step": 75 }, { "epoch": 0.42, "grad_norm": 5.034663677215576, "learning_rate": 0.00018324607329842934, "loss": 0.4752, "step": 80 }, { "epoch": 0.45, "grad_norm": 8.143318176269531, "learning_rate": 0.00018219895287958115, "loss": 0.3963, "step": 85 }, { "epoch": 0.47, "grad_norm": 6.556373119354248, "learning_rate": 0.000181151832460733, "loss": 0.5088, "step": 90 }, { "epoch": 0.5, "grad_norm": 4.414945125579834, "learning_rate": 0.0001801047120418848, "loss": 0.2903, "step": 95 }, { "epoch": 0.52, "grad_norm": 2.464193820953369, "learning_rate": 0.00017905759162303666, "loss": 0.2801, "step": 100 }, { "epoch": 0.52, "eval_accuracy": 0.9422572178477691, "eval_loss": 0.18048594892024994, "eval_precision": 0.9466471604125616, "eval_recall": 0.9422572178477691, "eval_runtime": 17.2463, "eval_samples_per_second": 44.183, "eval_steps_per_second": 2.783, "step": 100 }, { "epoch": 0.55, "grad_norm": 5.096113681793213, "learning_rate": 0.0001780104712041885, "loss": 0.213, "step": 105 }, { "epoch": 0.58, "grad_norm": 1.4251294136047363, "learning_rate": 0.00017696335078534032, "loss": 0.1831, "step": 110 }, { "epoch": 0.6, "grad_norm": 3.766965627670288, "learning_rate": 0.00017591623036649216, "loss": 0.1636, "step": 115 }, { "epoch": 0.63, "grad_norm": 3.66446852684021, "learning_rate": 0.00017486910994764398, "loss": 0.3075, "step": 120 }, { "epoch": 0.65, "grad_norm": 4.702181339263916, "learning_rate": 0.00017382198952879582, "loss": 0.2472, "step": 125 }, { "epoch": 0.68, "grad_norm": 8.463685035705566, "learning_rate": 0.00017277486910994763, "loss": 0.4341, "step": 130 }, { "epoch": 0.71, "grad_norm": 5.947971343994141, "learning_rate": 0.00017172774869109948, "loss": 0.2291, "step": 135 }, { "epoch": 0.73, "grad_norm": 5.682934761047363, "learning_rate": 0.00017068062827225132, "loss": 0.1632, "step": 140 }, { "epoch": 0.76, "grad_norm": 9.100614547729492, "learning_rate": 0.00016963350785340316, "loss": 0.307, "step": 145 }, { "epoch": 0.79, "grad_norm": 4.207605361938477, "learning_rate": 0.00016858638743455498, "loss": 0.2362, "step": 150 }, { "epoch": 0.79, "eval_accuracy": 0.9186351706036745, "eval_loss": 0.29145774245262146, "eval_precision": 0.9273784378254858, "eval_recall": 0.9186351706036745, "eval_runtime": 17.4226, "eval_samples_per_second": 43.736, "eval_steps_per_second": 2.755, "step": 150 }, { "epoch": 0.81, "grad_norm": 3.4303929805755615, "learning_rate": 0.00016753926701570682, "loss": 0.074, "step": 155 }, { "epoch": 0.84, "grad_norm": 4.852244853973389, "learning_rate": 0.00016649214659685867, "loss": 0.2731, "step": 160 }, { "epoch": 0.86, "grad_norm": 0.3142894506454468, "learning_rate": 0.00016544502617801048, "loss": 0.1295, "step": 165 }, { "epoch": 0.89, "grad_norm": 3.892892360687256, "learning_rate": 0.00016439790575916233, "loss": 0.225, "step": 170 }, { "epoch": 0.92, "grad_norm": 4.833043575286865, "learning_rate": 0.00016335078534031414, "loss": 0.2674, "step": 175 }, { "epoch": 0.94, "grad_norm": 5.150189399719238, "learning_rate": 0.00016230366492146599, "loss": 0.241, "step": 180 }, { "epoch": 0.97, "grad_norm": 5.426727771759033, "learning_rate": 0.0001612565445026178, "loss": 0.2049, "step": 185 }, { "epoch": 0.99, "grad_norm": 5.2064738273620605, "learning_rate": 0.00016020942408376964, "loss": 0.1756, "step": 190 }, { "epoch": 1.02, "grad_norm": 2.034318685531616, "learning_rate": 0.00015916230366492146, "loss": 0.0499, "step": 195 }, { "epoch": 1.05, "grad_norm": 4.4703145027160645, "learning_rate": 0.0001581151832460733, "loss": 0.042, "step": 200 }, { "epoch": 1.05, "eval_accuracy": 0.9461942257217848, "eval_loss": 0.16130226850509644, "eval_precision": 0.9511635531837818, "eval_recall": 0.9461942257217848, "eval_runtime": 17.3623, "eval_samples_per_second": 43.888, "eval_steps_per_second": 2.765, "step": 200 }, { "epoch": 1.07, "grad_norm": 0.06716866791248322, "learning_rate": 0.00015706806282722515, "loss": 0.0643, "step": 205 }, { "epoch": 1.1, "grad_norm": 7.002842903137207, "learning_rate": 0.00015602094240837696, "loss": 0.1028, "step": 210 }, { "epoch": 1.13, "grad_norm": 0.013943655416369438, "learning_rate": 0.0001549738219895288, "loss": 0.0941, "step": 215 }, { "epoch": 1.15, "grad_norm": 0.42214348912239075, "learning_rate": 0.00015392670157068062, "loss": 0.0273, "step": 220 }, { "epoch": 1.18, "grad_norm": 8.428805351257324, "learning_rate": 0.00015287958115183247, "loss": 0.0589, "step": 225 }, { "epoch": 1.2, "grad_norm": 2.468398094177246, "learning_rate": 0.00015183246073298428, "loss": 0.0321, "step": 230 }, { "epoch": 1.23, "grad_norm": 0.8661732077598572, "learning_rate": 0.00015078534031413612, "loss": 0.0856, "step": 235 }, { "epoch": 1.26, "grad_norm": 0.4541776478290558, "learning_rate": 0.00014973821989528797, "loss": 0.0136, "step": 240 }, { "epoch": 1.28, "grad_norm": 4.832011699676514, "learning_rate": 0.0001486910994764398, "loss": 0.1087, "step": 245 }, { "epoch": 1.31, "grad_norm": 4.744746685028076, "learning_rate": 0.00014764397905759163, "loss": 0.0477, "step": 250 }, { "epoch": 1.31, "eval_accuracy": 0.9514435695538058, "eval_loss": 0.14496225118637085, "eval_precision": 0.9534067355318784, "eval_recall": 0.9514435695538058, "eval_runtime": 17.4463, "eval_samples_per_second": 43.677, "eval_steps_per_second": 2.751, "step": 250 }, { "epoch": 1.34, "grad_norm": 0.053365129977464676, "learning_rate": 0.00014659685863874347, "loss": 0.0171, "step": 255 }, { "epoch": 1.36, "grad_norm": 0.41375842690467834, "learning_rate": 0.00014554973821989531, "loss": 0.1189, "step": 260 }, { "epoch": 1.39, "grad_norm": 0.05034720525145531, "learning_rate": 0.00014450261780104713, "loss": 0.0377, "step": 265 }, { "epoch": 1.41, "grad_norm": 2.498413324356079, "learning_rate": 0.00014345549738219897, "loss": 0.0334, "step": 270 }, { "epoch": 1.44, "grad_norm": 0.06190189719200134, "learning_rate": 0.0001424083769633508, "loss": 0.0158, "step": 275 }, { "epoch": 1.47, "grad_norm": 1.9862781763076782, "learning_rate": 0.00014136125654450263, "loss": 0.0186, "step": 280 }, { "epoch": 1.49, "grad_norm": 0.047805577516555786, "learning_rate": 0.00014031413612565445, "loss": 0.0705, "step": 285 }, { "epoch": 1.52, "grad_norm": 0.03765374794602394, "learning_rate": 0.0001392670157068063, "loss": 0.1822, "step": 290 }, { "epoch": 1.54, "grad_norm": 5.020593643188477, "learning_rate": 0.0001382198952879581, "loss": 0.0375, "step": 295 }, { "epoch": 1.57, "grad_norm": 1.0980048179626465, "learning_rate": 0.00013717277486910995, "loss": 0.1297, "step": 300 }, { "epoch": 1.57, "eval_accuracy": 0.937007874015748, "eval_loss": 0.24536970257759094, "eval_precision": 0.9442846541005356, "eval_recall": 0.937007874015748, "eval_runtime": 17.5121, "eval_samples_per_second": 43.513, "eval_steps_per_second": 2.741, "step": 300 }, { "epoch": 1.6, "grad_norm": 3.535846471786499, "learning_rate": 0.0001361256544502618, "loss": 0.1071, "step": 305 }, { "epoch": 1.62, "grad_norm": 4.182868957519531, "learning_rate": 0.0001350785340314136, "loss": 0.0249, "step": 310 }, { "epoch": 1.65, "grad_norm": 0.36884206533432007, "learning_rate": 0.00013403141361256545, "loss": 0.0762, "step": 315 }, { "epoch": 1.68, "grad_norm": 5.14253044128418, "learning_rate": 0.00013298429319371727, "loss": 0.0766, "step": 320 }, { "epoch": 1.7, "grad_norm": 1.2552727460861206, "learning_rate": 0.0001319371727748691, "loss": 0.0093, "step": 325 }, { "epoch": 1.73, "grad_norm": 3.767280101776123, "learning_rate": 0.00013089005235602096, "loss": 0.2232, "step": 330 }, { "epoch": 1.75, "grad_norm": 0.010386645793914795, "learning_rate": 0.00012984293193717277, "loss": 0.0026, "step": 335 }, { "epoch": 1.78, "grad_norm": 4.292264461517334, "learning_rate": 0.00012879581151832462, "loss": 0.2457, "step": 340 }, { "epoch": 1.81, "grad_norm": 0.024649567902088165, "learning_rate": 0.00012774869109947646, "loss": 0.0306, "step": 345 }, { "epoch": 1.83, "grad_norm": 5.098654270172119, "learning_rate": 0.00012670157068062827, "loss": 0.1156, "step": 350 }, { "epoch": 1.83, "eval_accuracy": 0.9501312335958005, "eval_loss": 0.17998863756656647, "eval_precision": 0.9531660720469141, "eval_recall": 0.9501312335958005, "eval_runtime": 17.4136, "eval_samples_per_second": 43.759, "eval_steps_per_second": 2.756, "step": 350 }, { "epoch": 1.86, "grad_norm": 0.9651967287063599, "learning_rate": 0.00012565445026178012, "loss": 0.0618, "step": 355 }, { "epoch": 1.88, "grad_norm": 2.727735996246338, "learning_rate": 0.00012460732984293196, "loss": 0.0705, "step": 360 }, { "epoch": 1.91, "grad_norm": 4.953824043273926, "learning_rate": 0.00012356020942408378, "loss": 0.2112, "step": 365 }, { "epoch": 1.94, "grad_norm": 0.08269181102514267, "learning_rate": 0.00012251308900523562, "loss": 0.071, "step": 370 }, { "epoch": 1.96, "grad_norm": 0.06396706402301788, "learning_rate": 0.00012146596858638744, "loss": 0.1069, "step": 375 }, { "epoch": 1.99, "grad_norm": 0.28295230865478516, "learning_rate": 0.00012041884816753928, "loss": 0.0759, "step": 380 }, { "epoch": 2.02, "grad_norm": 1.5805819034576416, "learning_rate": 0.0001193717277486911, "loss": 0.0622, "step": 385 }, { "epoch": 2.04, "grad_norm": 2.3917598724365234, "learning_rate": 0.00011832460732984294, "loss": 0.0512, "step": 390 }, { "epoch": 2.07, "grad_norm": 0.03971586003899574, "learning_rate": 0.00011727748691099475, "loss": 0.0163, "step": 395 }, { "epoch": 2.09, "grad_norm": 0.024471383541822433, "learning_rate": 0.0001162303664921466, "loss": 0.0188, "step": 400 }, { "epoch": 2.09, "eval_accuracy": 0.9593175853018373, "eval_loss": 0.14536795020103455, "eval_precision": 0.9601421810596481, "eval_recall": 0.9593175853018373, "eval_runtime": 17.4, "eval_samples_per_second": 43.793, "eval_steps_per_second": 2.759, "step": 400 }, { "epoch": 2.12, "grad_norm": 0.0705149918794632, "learning_rate": 0.00011518324607329844, "loss": 0.0062, "step": 405 }, { "epoch": 2.15, "grad_norm": 0.013097544200718403, "learning_rate": 0.00011413612565445027, "loss": 0.0191, "step": 410 }, { "epoch": 2.17, "grad_norm": 0.5236541628837585, "learning_rate": 0.00011308900523560211, "loss": 0.0403, "step": 415 }, { "epoch": 2.2, "grad_norm": 0.01967461034655571, "learning_rate": 0.00011204188481675393, "loss": 0.0032, "step": 420 }, { "epoch": 2.23, "grad_norm": 0.02198900654911995, "learning_rate": 0.00011099476439790577, "loss": 0.0019, "step": 425 }, { "epoch": 2.25, "grad_norm": 0.14719614386558533, "learning_rate": 0.00010994764397905759, "loss": 0.0052, "step": 430 }, { "epoch": 2.28, "grad_norm": 0.021793629974126816, "learning_rate": 0.00010890052356020943, "loss": 0.0105, "step": 435 }, { "epoch": 2.3, "grad_norm": 0.02368028089404106, "learning_rate": 0.00010785340314136125, "loss": 0.001, "step": 440 }, { "epoch": 2.33, "grad_norm": 0.010441082529723644, "learning_rate": 0.00010680628272251309, "loss": 0.0019, "step": 445 }, { "epoch": 2.36, "grad_norm": 0.015402965247631073, "learning_rate": 0.00010575916230366492, "loss": 0.0049, "step": 450 }, { "epoch": 2.36, "eval_accuracy": 0.9711286089238845, "eval_loss": 0.1082761362195015, "eval_precision": 0.9714506772867647, "eval_recall": 0.9711286089238845, "eval_runtime": 17.6808, "eval_samples_per_second": 43.098, "eval_steps_per_second": 2.715, "step": 450 }, { "epoch": 2.38, "grad_norm": 0.07379986345767975, "learning_rate": 0.00010471204188481676, "loss": 0.0047, "step": 455 }, { "epoch": 2.41, "grad_norm": 5.459177494049072, "learning_rate": 0.0001036649214659686, "loss": 0.0192, "step": 460 }, { "epoch": 2.43, "grad_norm": 0.004949438851326704, "learning_rate": 0.00010261780104712042, "loss": 0.0148, "step": 465 }, { "epoch": 2.46, "grad_norm": 0.17624743282794952, "learning_rate": 0.00010157068062827227, "loss": 0.0087, "step": 470 }, { "epoch": 2.49, "grad_norm": 0.010408706963062286, "learning_rate": 0.00010052356020942408, "loss": 0.0018, "step": 475 }, { "epoch": 2.51, "grad_norm": 0.007278731558471918, "learning_rate": 9.947643979057593e-05, "loss": 0.0029, "step": 480 }, { "epoch": 2.54, "grad_norm": 0.01558419968932867, "learning_rate": 9.842931937172776e-05, "loss": 0.0549, "step": 485 }, { "epoch": 2.57, "grad_norm": 0.005123753100633621, "learning_rate": 9.738219895287959e-05, "loss": 0.0006, "step": 490 }, { "epoch": 2.59, "grad_norm": 0.011126354336738586, "learning_rate": 9.633507853403142e-05, "loss": 0.003, "step": 495 }, { "epoch": 2.62, "grad_norm": 0.02012033388018608, "learning_rate": 9.528795811518324e-05, "loss": 0.0139, "step": 500 }, { "epoch": 2.62, "eval_accuracy": 0.963254593175853, "eval_loss": 0.16356664896011353, "eval_precision": 0.9644146563432678, "eval_recall": 0.963254593175853, "eval_runtime": 17.6855, "eval_samples_per_second": 43.086, "eval_steps_per_second": 2.714, "step": 500 } ], "logging_steps": 5, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 6.186131303461724e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }