{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.09142857142857143, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.563270330429077, |
|
"learning_rate": 4e-05, |
|
"loss": 2.4127, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.333299160003662, |
|
"learning_rate": 8e-05, |
|
"loss": 2.3102, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.8613317012786865, |
|
"learning_rate": 0.00012, |
|
"loss": 2.1983, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.620126485824585, |
|
"learning_rate": 0.00016, |
|
"loss": 2.0917, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0352119207382202, |
|
"learning_rate": 0.0002, |
|
"loss": 1.9613, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.8601917624473572, |
|
"learning_rate": 0.00019979899497487438, |
|
"loss": 1.6927, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.273868203163147, |
|
"learning_rate": 0.00019959798994974876, |
|
"loss": 1.6828, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0873847007751465, |
|
"learning_rate": 0.00019939698492462313, |
|
"loss": 1.5088, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.5800795555114746, |
|
"learning_rate": 0.0001991959798994975, |
|
"loss": 1.3702, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.693160355091095, |
|
"learning_rate": 0.00019899497487437187, |
|
"loss": 1.3718, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.49409618973731995, |
|
"learning_rate": 0.00019879396984924622, |
|
"loss": 1.3583, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.47029319405555725, |
|
"learning_rate": 0.00019859296482412062, |
|
"loss": 1.2791, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.43618088960647583, |
|
"learning_rate": 0.000198391959798995, |
|
"loss": 1.3161, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3907912075519562, |
|
"learning_rate": 0.00019819095477386937, |
|
"loss": 1.2954, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6292415857315063, |
|
"learning_rate": 0.0001979899497487437, |
|
"loss": 1.3397, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.37423521280288696, |
|
"learning_rate": 0.0001977889447236181, |
|
"loss": 1.3983, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3845643699169159, |
|
"learning_rate": 0.00019758793969849249, |
|
"loss": 1.3349, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3657298982143402, |
|
"learning_rate": 0.00019738693467336683, |
|
"loss": 1.2767, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3727971315383911, |
|
"learning_rate": 0.0001971859296482412, |
|
"loss": 1.3672, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.35123032331466675, |
|
"learning_rate": 0.0001969849246231156, |
|
"loss": 1.3692, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4003850221633911, |
|
"learning_rate": 0.00019678391959798995, |
|
"loss": 1.3412, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3638221323490143, |
|
"learning_rate": 0.00019658291457286432, |
|
"loss": 1.2813, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.391216903924942, |
|
"learning_rate": 0.0001963819095477387, |
|
"loss": 1.2853, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4370620846748352, |
|
"learning_rate": 0.0001961809045226131, |
|
"loss": 1.2524, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3566085696220398, |
|
"learning_rate": 0.00019597989949748744, |
|
"loss": 1.3192, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.35438084602355957, |
|
"learning_rate": 0.00019577889447236181, |
|
"loss": 1.2858, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3968108296394348, |
|
"learning_rate": 0.0001955778894472362, |
|
"loss": 1.3112, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.36512017250061035, |
|
"learning_rate": 0.00019537688442211056, |
|
"loss": 1.278, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3982504606246948, |
|
"learning_rate": 0.00019517587939698493, |
|
"loss": 1.2392, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.38377949595451355, |
|
"learning_rate": 0.0001949748743718593, |
|
"loss": 1.2843, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3582867980003357, |
|
"learning_rate": 0.00019477386934673368, |
|
"loss": 1.3008, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.3572194576263428, |
|
"learning_rate": 0.00019457286432160805, |
|
"loss": 1.294, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.35502907633781433, |
|
"learning_rate": 0.00019437185929648243, |
|
"loss": 1.3877, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3649040460586548, |
|
"learning_rate": 0.0001941708542713568, |
|
"loss": 1.2966, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3649256229400635, |
|
"learning_rate": 0.00019396984924623117, |
|
"loss": 1.2354, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.36085084080696106, |
|
"learning_rate": 0.00019376884422110552, |
|
"loss": 1.2409, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.35929059982299805, |
|
"learning_rate": 0.00019356783919597992, |
|
"loss": 1.243, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3897881805896759, |
|
"learning_rate": 0.0001933668341708543, |
|
"loss": 1.3945, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.35484543442726135, |
|
"learning_rate": 0.00019316582914572864, |
|
"loss": 1.3433, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.35691192746162415, |
|
"learning_rate": 0.000192964824120603, |
|
"loss": 1.3243, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3804129958152771, |
|
"learning_rate": 0.0001927638190954774, |
|
"loss": 1.2509, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3623339831829071, |
|
"learning_rate": 0.00019256281407035178, |
|
"loss": 1.1799, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3411855697631836, |
|
"learning_rate": 0.00019236180904522613, |
|
"loss": 1.2372, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.36590930819511414, |
|
"learning_rate": 0.0001921608040201005, |
|
"loss": 1.2585, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.30974116921424866, |
|
"learning_rate": 0.0001919597989949749, |
|
"loss": 1.2974, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.32794803380966187, |
|
"learning_rate": 0.00019175879396984925, |
|
"loss": 1.2696, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.33263906836509705, |
|
"learning_rate": 0.00019155778894472362, |
|
"loss": 1.3209, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.31748828291893005, |
|
"learning_rate": 0.000191356783919598, |
|
"loss": 1.278, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.34738561511039734, |
|
"learning_rate": 0.0001911557788944724, |
|
"loss": 1.2105, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3313944339752197, |
|
"learning_rate": 0.00019095477386934674, |
|
"loss": 1.2527, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.33137476444244385, |
|
"learning_rate": 0.0001907537688442211, |
|
"loss": 1.2984, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.31752490997314453, |
|
"learning_rate": 0.00019055276381909548, |
|
"loss": 1.307, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3111082911491394, |
|
"learning_rate": 0.00019035175879396986, |
|
"loss": 1.2769, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.31065696477890015, |
|
"learning_rate": 0.00019015075376884423, |
|
"loss": 1.3082, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3382773697376251, |
|
"learning_rate": 0.0001899497487437186, |
|
"loss": 1.2744, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.34320947527885437, |
|
"learning_rate": 0.00018974874371859298, |
|
"loss": 1.3013, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.33131280541419983, |
|
"learning_rate": 0.00018954773869346732, |
|
"loss": 1.4066, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3357389569282532, |
|
"learning_rate": 0.00018934673366834172, |
|
"loss": 1.2841, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3200838267803192, |
|
"learning_rate": 0.0001891457286432161, |
|
"loss": 1.2654, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3336584270000458, |
|
"learning_rate": 0.00018894472361809047, |
|
"loss": 1.1716, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3128441274166107, |
|
"learning_rate": 0.00018874371859296481, |
|
"loss": 1.3009, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.30249112844467163, |
|
"learning_rate": 0.00018854271356783921, |
|
"loss": 1.2311, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3263241946697235, |
|
"learning_rate": 0.0001883417085427136, |
|
"loss": 1.2344, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.32348358631134033, |
|
"learning_rate": 0.00018814070351758793, |
|
"loss": 1.3023, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6508419513702393, |
|
"learning_rate": 0.0001879396984924623, |
|
"loss": 1.2028, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.34560996294021606, |
|
"learning_rate": 0.0001877386934673367, |
|
"loss": 1.389, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.36555996537208557, |
|
"learning_rate": 0.00018753768844221108, |
|
"loss": 1.3653, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3195466697216034, |
|
"learning_rate": 0.00018733668341708543, |
|
"loss": 1.2412, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.30555933713912964, |
|
"learning_rate": 0.0001871356783919598, |
|
"loss": 1.2357, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.30776411294937134, |
|
"learning_rate": 0.0001869346733668342, |
|
"loss": 1.3112, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.31933915615081787, |
|
"learning_rate": 0.00018673366834170854, |
|
"loss": 1.1951, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3241545259952545, |
|
"learning_rate": 0.00018653266331658292, |
|
"loss": 1.2717, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3117482364177704, |
|
"learning_rate": 0.0001863316582914573, |
|
"loss": 1.3031, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.33056551218032837, |
|
"learning_rate": 0.0001861306532663317, |
|
"loss": 1.2098, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.32441195845603943, |
|
"learning_rate": 0.00018592964824120604, |
|
"loss": 1.2135, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.34216779470443726, |
|
"learning_rate": 0.0001857286432160804, |
|
"loss": 1.2531, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32885247468948364, |
|
"learning_rate": 0.00018552763819095478, |
|
"loss": 1.3054, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.34541794657707214, |
|
"learning_rate": 0.00018532663316582915, |
|
"loss": 1.3207, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.30329057574272156, |
|
"learning_rate": 0.00018512562814070353, |
|
"loss": 1.2652, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.31469491124153137, |
|
"learning_rate": 0.0001849246231155779, |
|
"loss": 1.1961, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3181230127811432, |
|
"learning_rate": 0.00018472361809045227, |
|
"loss": 1.3111, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3181725740432739, |
|
"learning_rate": 0.00018452261306532662, |
|
"loss": 1.3353, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3154084384441376, |
|
"learning_rate": 0.00018432160804020102, |
|
"loss": 1.2418, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.35061103105545044, |
|
"learning_rate": 0.0001841206030150754, |
|
"loss": 1.2332, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3259966969490051, |
|
"learning_rate": 0.00018391959798994977, |
|
"loss": 1.3633, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.31192857027053833, |
|
"learning_rate": 0.0001837185929648241, |
|
"loss": 1.1886, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32024237513542175, |
|
"learning_rate": 0.0001835175879396985, |
|
"loss": 1.2141, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.302498459815979, |
|
"learning_rate": 0.00018331658291457288, |
|
"loss": 1.237, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3569789230823517, |
|
"learning_rate": 0.00018311557788944723, |
|
"loss": 1.3015, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3121156692504883, |
|
"learning_rate": 0.0001829145728643216, |
|
"loss": 1.281, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.31279826164245605, |
|
"learning_rate": 0.000182713567839196, |
|
"loss": 1.2924, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3210877478122711, |
|
"learning_rate": 0.00018251256281407038, |
|
"loss": 1.3082, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.331406831741333, |
|
"learning_rate": 0.00018231155778894472, |
|
"loss": 1.2434, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3135213255882263, |
|
"learning_rate": 0.0001821105527638191, |
|
"loss": 1.2188, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.31146401166915894, |
|
"learning_rate": 0.0001819095477386935, |
|
"loss": 1.2484, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.32071712613105774, |
|
"learning_rate": 0.00018170854271356784, |
|
"loss": 1.1927, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3343571722507477, |
|
"learning_rate": 0.00018150753768844221, |
|
"loss": 1.3443, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3510550558567047, |
|
"learning_rate": 0.0001813065326633166, |
|
"loss": 1.2832, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.33436939120292664, |
|
"learning_rate": 0.00018110552763819096, |
|
"loss": 1.252, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3175451159477234, |
|
"learning_rate": 0.00018090452261306533, |
|
"loss": 1.251, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.32603979110717773, |
|
"learning_rate": 0.0001807035175879397, |
|
"loss": 1.228, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3073003590106964, |
|
"learning_rate": 0.00018050251256281408, |
|
"loss": 1.2659, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3285619616508484, |
|
"learning_rate": 0.00018030150753768845, |
|
"loss": 1.2826, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3038572072982788, |
|
"learning_rate": 0.00018010050251256282, |
|
"loss": 1.217, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.35778746008872986, |
|
"learning_rate": 0.0001798994974874372, |
|
"loss": 1.2901, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2900612950325012, |
|
"learning_rate": 0.00017969849246231157, |
|
"loss": 1.2651, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.32928743958473206, |
|
"learning_rate": 0.00017949748743718592, |
|
"loss": 1.3143, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.32471874356269836, |
|
"learning_rate": 0.00017929648241206032, |
|
"loss": 1.1834, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.30989256501197815, |
|
"learning_rate": 0.0001790954773869347, |
|
"loss": 1.2216, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3371771275997162, |
|
"learning_rate": 0.00017889447236180906, |
|
"loss": 1.197, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.31041428446769714, |
|
"learning_rate": 0.0001786934673366834, |
|
"loss": 1.27, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3152185082435608, |
|
"learning_rate": 0.0001784924623115578, |
|
"loss": 1.2436, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3227459490299225, |
|
"learning_rate": 0.00017829145728643218, |
|
"loss": 1.2401, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3246959149837494, |
|
"learning_rate": 0.00017809045226130653, |
|
"loss": 1.2703, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.38032859563827515, |
|
"learning_rate": 0.0001778894472361809, |
|
"loss": 1.3266, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.33325478434562683, |
|
"learning_rate": 0.0001776884422110553, |
|
"loss": 1.2954, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3178690969944, |
|
"learning_rate": 0.00017748743718592967, |
|
"loss": 1.1793, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.31393784284591675, |
|
"learning_rate": 0.00017728643216080402, |
|
"loss": 1.277, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3150279223918915, |
|
"learning_rate": 0.0001770854271356784, |
|
"loss": 1.293, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.32476913928985596, |
|
"learning_rate": 0.0001768844221105528, |
|
"loss": 1.2569, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.36075925827026367, |
|
"learning_rate": 0.00017668341708542714, |
|
"loss": 1.205, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.33134496212005615, |
|
"learning_rate": 0.0001764824120603015, |
|
"loss": 1.2299, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.30507662892341614, |
|
"learning_rate": 0.00017628140703517588, |
|
"loss": 1.2883, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.34049952030181885, |
|
"learning_rate": 0.00017608040201005026, |
|
"loss": 1.214, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3405919373035431, |
|
"learning_rate": 0.00017587939698492463, |
|
"loss": 1.2738, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3306083679199219, |
|
"learning_rate": 0.000175678391959799, |
|
"loss": 1.2415, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.33770737051963806, |
|
"learning_rate": 0.00017547738693467338, |
|
"loss": 1.3233, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3261878788471222, |
|
"learning_rate": 0.00017527638190954775, |
|
"loss": 1.2695, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3433193266391754, |
|
"learning_rate": 0.00017507537688442212, |
|
"loss": 1.2052, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3111405670642853, |
|
"learning_rate": 0.0001748743718592965, |
|
"loss": 1.2798, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3630310297012329, |
|
"learning_rate": 0.00017467336683417087, |
|
"loss": 1.2567, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.31963038444519043, |
|
"learning_rate": 0.00017447236180904521, |
|
"loss": 1.2455, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.299695760011673, |
|
"learning_rate": 0.00017427135678391961, |
|
"loss": 1.207, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3167514503002167, |
|
"learning_rate": 0.000174070351758794, |
|
"loss": 1.2378, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.31375688314437866, |
|
"learning_rate": 0.00017386934673366836, |
|
"loss": 1.2658, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.34311383962631226, |
|
"learning_rate": 0.0001736683417085427, |
|
"loss": 1.2004, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.31706517934799194, |
|
"learning_rate": 0.0001734673366834171, |
|
"loss": 1.1879, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.31296172738075256, |
|
"learning_rate": 0.00017326633165829148, |
|
"loss": 1.1866, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3254072368144989, |
|
"learning_rate": 0.00017306532663316582, |
|
"loss": 1.1952, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3165453374385834, |
|
"learning_rate": 0.0001728643216080402, |
|
"loss": 1.3459, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.35455992817878723, |
|
"learning_rate": 0.0001726633165829146, |
|
"loss": 1.2494, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3116908073425293, |
|
"learning_rate": 0.00017246231155778897, |
|
"loss": 1.2225, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3141638934612274, |
|
"learning_rate": 0.00017226130653266332, |
|
"loss": 1.3385, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3096507787704468, |
|
"learning_rate": 0.0001720603015075377, |
|
"loss": 1.3257, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3160630464553833, |
|
"learning_rate": 0.00017185929648241206, |
|
"loss": 1.2683, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3342824876308441, |
|
"learning_rate": 0.00017165829145728644, |
|
"loss": 1.2551, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3086145222187042, |
|
"learning_rate": 0.0001714572864321608, |
|
"loss": 1.2456, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3001709282398224, |
|
"learning_rate": 0.00017125628140703518, |
|
"loss": 1.2287, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3277103304862976, |
|
"learning_rate": 0.00017105527638190955, |
|
"loss": 1.3302, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33616161346435547, |
|
"learning_rate": 0.00017085427135678393, |
|
"loss": 1.2604, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3231915533542633, |
|
"learning_rate": 0.0001706532663316583, |
|
"loss": 1.2367, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3305569291114807, |
|
"learning_rate": 0.00017045226130653267, |
|
"loss": 1.2387, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.35031118988990784, |
|
"learning_rate": 0.00017025125628140705, |
|
"loss": 1.2464, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3142334222793579, |
|
"learning_rate": 0.00017005025125628142, |
|
"loss": 1.3614, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.31159430742263794, |
|
"learning_rate": 0.0001698492462311558, |
|
"loss": 1.2556, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3273050785064697, |
|
"learning_rate": 0.00016964824120603016, |
|
"loss": 1.3519, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3299296796321869, |
|
"learning_rate": 0.0001694472361809045, |
|
"loss": 1.1763, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33138513565063477, |
|
"learning_rate": 0.0001692462311557789, |
|
"loss": 1.17, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.30424776673316956, |
|
"learning_rate": 0.00016904522613065328, |
|
"loss": 1.123, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3452983498573303, |
|
"learning_rate": 0.00016884422110552766, |
|
"loss": 1.2999, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33614206314086914, |
|
"learning_rate": 0.000168643216080402, |
|
"loss": 1.262, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.32416558265686035, |
|
"learning_rate": 0.0001684422110552764, |
|
"loss": 1.2514, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.29827457666397095, |
|
"learning_rate": 0.00016824120603015078, |
|
"loss": 1.2461, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.32572871446609497, |
|
"learning_rate": 0.00016804020100502512, |
|
"loss": 1.2393, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.32171282172203064, |
|
"learning_rate": 0.0001678391959798995, |
|
"loss": 1.3045, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.34592801332473755, |
|
"learning_rate": 0.0001676381909547739, |
|
"loss": 1.2669, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.33795440196990967, |
|
"learning_rate": 0.00016743718592964827, |
|
"loss": 1.1404, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.32598641514778137, |
|
"learning_rate": 0.0001672361809045226, |
|
"loss": 1.2495, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.31816181540489197, |
|
"learning_rate": 0.00016703517587939699, |
|
"loss": 1.3003, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3340943157672882, |
|
"learning_rate": 0.00016683417085427136, |
|
"loss": 1.2615, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3242477476596832, |
|
"learning_rate": 0.00016663316582914573, |
|
"loss": 1.2527, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.308652400970459, |
|
"learning_rate": 0.0001664321608040201, |
|
"loss": 1.3241, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.31818273663520813, |
|
"learning_rate": 0.00016623115577889448, |
|
"loss": 1.3712, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.32885751128196716, |
|
"learning_rate": 0.00016603015075376885, |
|
"loss": 1.2583, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.32561740279197693, |
|
"learning_rate": 0.00016582914572864322, |
|
"loss": 1.2458, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3278496563434601, |
|
"learning_rate": 0.0001656281407035176, |
|
"loss": 1.2205, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.32530438899993896, |
|
"learning_rate": 0.00016542713567839197, |
|
"loss": 1.2235, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.31232836842536926, |
|
"learning_rate": 0.00016522613065326634, |
|
"loss": 1.199, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3209743797779083, |
|
"learning_rate": 0.00016502512562814072, |
|
"loss": 1.2717, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.329940527677536, |
|
"learning_rate": 0.0001648241206030151, |
|
"loss": 1.2425, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3144824802875519, |
|
"learning_rate": 0.00016462311557788946, |
|
"loss": 1.2444, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3218553066253662, |
|
"learning_rate": 0.0001644221105527638, |
|
"loss": 1.2815, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.33460506796836853, |
|
"learning_rate": 0.0001642211055276382, |
|
"loss": 1.3774, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3300727605819702, |
|
"learning_rate": 0.00016402010050251258, |
|
"loss": 1.3436, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3530360460281372, |
|
"learning_rate": 0.00016381909547738695, |
|
"loss": 1.2605, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3326485753059387, |
|
"learning_rate": 0.0001636180904522613, |
|
"loss": 1.2202, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.31355732679367065, |
|
"learning_rate": 0.0001634170854271357, |
|
"loss": 1.2798, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3162304759025574, |
|
"learning_rate": 0.00016321608040201007, |
|
"loss": 1.2118, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.32264095544815063, |
|
"learning_rate": 0.00016301507537688442, |
|
"loss": 1.2775, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.30425918102264404, |
|
"learning_rate": 0.0001628140703517588, |
|
"loss": 1.1438, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.33907556533813477, |
|
"learning_rate": 0.00016261306532663316, |
|
"loss": 1.4077, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.32334232330322266, |
|
"learning_rate": 0.00016241206030150756, |
|
"loss": 1.2673, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.32999834418296814, |
|
"learning_rate": 0.0001622110552763819, |
|
"loss": 1.257, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3223746120929718, |
|
"learning_rate": 0.00016201005025125628, |
|
"loss": 1.2125, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3236989378929138, |
|
"learning_rate": 0.00016180904522613066, |
|
"loss": 1.3645, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3303336203098297, |
|
"learning_rate": 0.00016160804020100503, |
|
"loss": 1.2786, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3135005831718445, |
|
"learning_rate": 0.0001614070351758794, |
|
"loss": 1.2775, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3185466527938843, |
|
"learning_rate": 0.00016120603015075378, |
|
"loss": 1.2128, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3355714976787567, |
|
"learning_rate": 0.00016100502512562815, |
|
"loss": 1.307, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.339216023683548, |
|
"learning_rate": 0.00016080402010050252, |
|
"loss": 1.2845, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"total_flos": 9.299042116288512e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |