{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010416666666666666, "grad_norm": 177.44975633333866, "learning_rate": 1.0116157446871364e-08, "logits/chosen": -2.590585231781006, "logits/rejected": -2.5664222240448, "logps/chosen": -80.29847717285156, "logps/rejected": -53.10200881958008, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.10416666666666667, "grad_norm": 157.38886271050058, "learning_rate": 1.0116157446871364e-07, "logits/chosen": -2.556668758392334, "logits/rejected": -2.5388104915618896, "logps/chosen": -87.92106628417969, "logps/rejected": -81.02932739257812, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.0015910979127511382, "rewards/margins": -0.004063541069626808, "rewards/rejected": 0.002472442574799061, "step": 10 }, { "epoch": 0.20833333333333334, "grad_norm": 138.46519913851827, "learning_rate": 2.0232314893742728e-07, "logits/chosen": -2.614779233932495, "logits/rejected": -2.566148042678833, "logps/chosen": -103.30540466308594, "logps/rejected": -89.8634262084961, "loss": 0.6884, "rewards/accuracies": 0.3125, "rewards/chosen": 0.012269504368305206, "rewards/margins": 0.006630963645875454, "rewards/rejected": 0.005638539791107178, "step": 20 }, { "epoch": 0.3125, "grad_norm": 182.67677013741292, "learning_rate": 2.92621969179504e-07, "logits/chosen": -2.5195202827453613, "logits/rejected": -2.5337605476379395, "logps/chosen": -67.20966339111328, "logps/rejected": -74.8441162109375, "loss": 0.68, "rewards/accuracies": 0.3125, "rewards/chosen": 0.06131690740585327, "rewards/margins": 0.03928915411233902, "rewards/rejected": 0.0220277551561594, "step": 30 }, { "epoch": 0.4166666666666667, "grad_norm": 122.9129643173066, "learning_rate": 2.851560013818488e-07, "logits/chosen": -2.575892448425293, "logits/rejected": -2.563741683959961, "logps/chosen": -71.55787658691406, "logps/rejected": -70.44486236572266, "loss": 0.6694, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": 0.15858440101146698, "rewards/margins": 0.08626840263605118, "rewards/rejected": 0.0723160058259964, "step": 40 }, { "epoch": 0.5208333333333334, "grad_norm": 125.21896044373509, "learning_rate": 2.776900335841936e-07, "logits/chosen": -2.4858391284942627, "logits/rejected": -2.4980547428131104, "logps/chosen": -50.068031311035156, "logps/rejected": -58.41581344604492, "loss": 0.6626, "rewards/accuracies": 0.23125000298023224, "rewards/chosen": 0.22443357110023499, "rewards/margins": 0.09153502434492111, "rewards/rejected": 0.13289853930473328, "step": 50 }, { "epoch": 0.625, "grad_norm": 129.16794274257964, "learning_rate": 2.702240657865384e-07, "logits/chosen": -2.5691611766815186, "logits/rejected": -2.550184726715088, "logps/chosen": -77.5196762084961, "logps/rejected": -78.63575744628906, "loss": 0.651, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": 0.4174376130104065, "rewards/margins": 0.22838692367076874, "rewards/rejected": 0.18905068933963776, "step": 60 }, { "epoch": 0.7291666666666666, "grad_norm": 127.48114805532535, "learning_rate": 2.627580979888832e-07, "logits/chosen": -2.5489492416381836, "logits/rejected": -2.539600133895874, "logps/chosen": -94.18212890625, "logps/rejected": -84.33738708496094, "loss": 0.6407, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.5826455354690552, "rewards/margins": 0.28286492824554443, "rewards/rejected": 0.2997806966304779, "step": 70 }, { "epoch": 0.8333333333333334, "grad_norm": 204.05352307435749, "learning_rate": 2.55292130191228e-07, "logits/chosen": -2.549272298812866, "logits/rejected": -2.494861602783203, "logps/chosen": -83.9472427368164, "logps/rejected": -79.15623474121094, "loss": 0.6197, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.45742616057395935, "rewards/margins": 0.39647966623306274, "rewards/rejected": 0.06094648689031601, "step": 80 }, { "epoch": 0.9375, "grad_norm": 177.818739531135, "learning_rate": 2.4782616239357287e-07, "logits/chosen": -2.481269359588623, "logits/rejected": -2.476210355758667, "logps/chosen": -53.21185302734375, "logps/rejected": -63.6622200012207, "loss": 0.6402, "rewards/accuracies": 0.28125, "rewards/chosen": 0.20689637959003448, "rewards/margins": 0.25917547941207886, "rewards/rejected": -0.05227910727262497, "step": 90 }, { "epoch": 1.0416666666666667, "grad_norm": 42.3875191112844, "learning_rate": 2.403601945959176e-07, "logits/chosen": -2.4865407943725586, "logits/rejected": -2.4648594856262207, "logps/chosen": -71.56224060058594, "logps/rejected": -73.2033920288086, "loss": 0.529, "rewards/accuracies": 0.40625, "rewards/chosen": 0.5279342532157898, "rewards/margins": 0.8453754186630249, "rewards/rejected": -0.3174411952495575, "step": 100 }, { "epoch": 1.0416666666666667, "eval_logits/chosen": -2.5210909843444824, "eval_logits/rejected": -2.504479169845581, "eval_logps/chosen": -73.25910949707031, "eval_logps/rejected": -81.07815551757812, "eval_loss": 0.6391177773475647, "eval_rewards/accuracies": 0.335317462682724, "eval_rewards/chosen": 0.33193930983543396, "eval_rewards/margins": 0.3184153139591217, "eval_rewards/rejected": 0.01352396234869957, "eval_runtime": 113.5699, "eval_samples_per_second": 17.61, "eval_steps_per_second": 0.555, "step": 100 }, { "epoch": 1.1458333333333333, "grad_norm": 32.2449798694013, "learning_rate": 2.3289422679826245e-07, "logits/chosen": -2.494868516921997, "logits/rejected": -2.528102159500122, "logps/chosen": -61.715614318847656, "logps/rejected": -84.56453704833984, "loss": 0.4071, "rewards/accuracies": 0.46875, "rewards/chosen": 0.9841395616531372, "rewards/margins": 1.814012885093689, "rewards/rejected": -0.8298734426498413, "step": 110 }, { "epoch": 1.25, "grad_norm": 34.815563904415676, "learning_rate": 2.2542825900060728e-07, "logits/chosen": -2.561702251434326, "logits/rejected": -2.5297372341156006, "logps/chosen": -103.7978744506836, "logps/rejected": -98.90431213378906, "loss": 0.4202, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.1336109638214111, "rewards/margins": 2.633657932281494, "rewards/rejected": -1.5000473260879517, "step": 120 }, { "epoch": 1.3541666666666667, "grad_norm": 36.65231392815614, "learning_rate": 2.1796229120295209e-07, "logits/chosen": -2.4902865886688232, "logits/rejected": -2.4940361976623535, "logps/chosen": -85.04226684570312, "logps/rejected": -96.18626403808594, "loss": 0.4074, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.41049623489379883, "rewards/margins": 2.2558376789093018, "rewards/rejected": -1.8453413248062134, "step": 130 }, { "epoch": 1.4583333333333333, "grad_norm": 77.16009856421289, "learning_rate": 2.104963234052969e-07, "logits/chosen": -2.4936976432800293, "logits/rejected": -2.509275197982788, "logps/chosen": -47.730525970458984, "logps/rejected": -70.30806732177734, "loss": 0.414, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.41952085494995117, "rewards/margins": 1.8167709112167358, "rewards/rejected": -1.3972498178482056, "step": 140 }, { "epoch": 1.5625, "grad_norm": 26.196738217281826, "learning_rate": 2.030303556076417e-07, "logits/chosen": -2.5342118740081787, "logits/rejected": -2.5107994079589844, "logps/chosen": -68.2138671875, "logps/rejected": -77.74583435058594, "loss": 0.4107, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 1.0598348379135132, "rewards/margins": 2.18320894241333, "rewards/rejected": -1.1233742237091064, "step": 150 }, { "epoch": 1.6666666666666665, "grad_norm": 17.275725311131126, "learning_rate": 1.955643878099865e-07, "logits/chosen": -2.5043556690216064, "logits/rejected": -2.513214588165283, "logps/chosen": -74.29962921142578, "logps/rejected": -91.91645050048828, "loss": 0.399, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.7849496603012085, "rewards/margins": 2.8569164276123047, "rewards/rejected": -1.0719670057296753, "step": 160 }, { "epoch": 1.7708333333333335, "grad_norm": 41.27028375416643, "learning_rate": 1.880984200123313e-07, "logits/chosen": -2.47822904586792, "logits/rejected": -2.469449281692505, "logps/chosen": -68.61357879638672, "logps/rejected": -85.9351806640625, "loss": 0.4081, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.7798223495483398, "rewards/margins": 2.5742733478546143, "rewards/rejected": -0.794451117515564, "step": 170 }, { "epoch": 1.875, "grad_norm": 59.43887874577346, "learning_rate": 1.806324522146761e-07, "logits/chosen": -2.500208616256714, "logits/rejected": -2.481628656387329, "logps/chosen": -68.31395721435547, "logps/rejected": -78.98301696777344, "loss": 0.3901, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.9541478157043457, "rewards/margins": 2.4010345935821533, "rewards/rejected": -1.4468867778778076, "step": 180 }, { "epoch": 1.9791666666666665, "grad_norm": 36.95333305414057, "learning_rate": 1.731664844170209e-07, "logits/chosen": -2.5415730476379395, "logits/rejected": -2.540619373321533, "logps/chosen": -85.87374114990234, "logps/rejected": -99.43651580810547, "loss": 0.3978, "rewards/accuracies": 0.53125, "rewards/chosen": 0.6972099542617798, "rewards/margins": 3.0720162391662598, "rewards/rejected": -2.3748061656951904, "step": 190 }, { "epoch": 2.0833333333333335, "grad_norm": 1.2362916215000161, "learning_rate": 1.657005166193657e-07, "logits/chosen": -2.4930806159973145, "logits/rejected": -2.5152084827423096, "logps/chosen": -85.92092895507812, "logps/rejected": -112.84880065917969, "loss": 0.3646, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.8306751251220703, "rewards/margins": 3.8782944679260254, "rewards/rejected": -3.047619342803955, "step": 200 }, { "epoch": 2.0833333333333335, "eval_logits/chosen": -2.541405439376831, "eval_logits/rejected": -2.5242767333984375, "eval_logps/chosen": -75.4660415649414, "eval_logps/rejected": -84.29215240478516, "eval_loss": 0.6853565573692322, "eval_rewards/accuracies": 0.3253968358039856, "eval_rewards/chosen": -0.2626197934150696, "eval_rewards/margins": 0.589722752571106, "eval_rewards/rejected": -0.8523425459861755, "eval_runtime": 113.3948, "eval_samples_per_second": 17.637, "eval_steps_per_second": 0.556, "step": 200 }, { "epoch": 2.1875, "grad_norm": 2.0446238829126155, "learning_rate": 1.5823454882171052e-07, "logits/chosen": -2.539818286895752, "logits/rejected": -2.533048391342163, "logps/chosen": -59.99456787109375, "logps/rejected": -76.70134735107422, "loss": 0.3844, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.9697097539901733, "rewards/margins": 3.1277875900268555, "rewards/rejected": -2.1580779552459717, "step": 210 }, { "epoch": 2.2916666666666665, "grad_norm": 1.7408027431292374, "learning_rate": 1.5076858102405535e-07, "logits/chosen": -2.5433623790740967, "logits/rejected": -2.4930663108825684, "logps/chosen": -97.69389343261719, "logps/rejected": -106.0047607421875, "loss": 0.332, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.1734516620635986, "rewards/margins": 4.781111240386963, "rewards/rejected": -3.607659101486206, "step": 220 }, { "epoch": 2.3958333333333335, "grad_norm": 1.4060304822058385, "learning_rate": 1.4330261322640012e-07, "logits/chosen": -2.526247978210449, "logits/rejected": -2.49235200881958, "logps/chosen": -56.86894607543945, "logps/rejected": -65.83305358886719, "loss": 0.3811, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.7982641458511353, "rewards/margins": 3.461902618408203, "rewards/rejected": -2.663638114929199, "step": 230 }, { "epoch": 2.5, "grad_norm": 2.2210458558484496, "learning_rate": 1.3583664542874493e-07, "logits/chosen": -2.505704164505005, "logits/rejected": -2.455660104751587, "logps/chosen": -83.47601318359375, "logps/rejected": -90.42607116699219, "loss": 0.3758, "rewards/accuracies": 0.5, "rewards/chosen": 1.1906569004058838, "rewards/margins": 4.154426574707031, "rewards/rejected": -2.9637694358825684, "step": 240 }, { "epoch": 2.6041666666666665, "grad_norm": 8.8054104852066, "learning_rate": 1.2837067763108973e-07, "logits/chosen": -2.450004816055298, "logits/rejected": -2.437671184539795, "logps/chosen": -72.790771484375, "logps/rejected": -84.4940185546875, "loss": 0.3479, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.03280387073755264, "rewards/margins": 3.7539596557617188, "rewards/rejected": -3.7867636680603027, "step": 250 }, { "epoch": 2.7083333333333335, "grad_norm": 1.6270602506146907, "learning_rate": 1.2090470983343453e-07, "logits/chosen": -2.443758726119995, "logits/rejected": -2.461164712905884, "logps/chosen": -55.83058547973633, "logps/rejected": -86.06732940673828, "loss": 0.3802, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.6197060346603394, "rewards/margins": 4.14993143081665, "rewards/rejected": -3.5302250385284424, "step": 260 }, { "epoch": 2.8125, "grad_norm": 6.344724508571099, "learning_rate": 1.1343874203577934e-07, "logits/chosen": -2.443955659866333, "logits/rejected": -2.4808030128479004, "logps/chosen": -74.16885375976562, "logps/rejected": -91.22604370117188, "loss": 0.3551, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.6749483346939087, "rewards/margins": 4.160412788391113, "rewards/rejected": -3.4854648113250732, "step": 270 }, { "epoch": 2.9166666666666665, "grad_norm": 10.707111276605081, "learning_rate": 1.0597277423812416e-07, "logits/chosen": -2.5015740394592285, "logits/rejected": -2.465491771697998, "logps/chosen": -81.94514465332031, "logps/rejected": -95.30039978027344, "loss": 0.3654, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.9193007349967957, "rewards/margins": 4.401783466339111, "rewards/rejected": -3.482482433319092, "step": 280 }, { "epoch": 3.0, "step": 288, "total_flos": 0.0, "train_loss": 0.4762688593731986, "train_runtime": 3179.8752, "train_samples_per_second": 5.767, "train_steps_per_second": 0.091 } ], "logging_steps": 10, "max_steps": 288, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }