MohamedAhmedAE commited on
Commit
fe43866
1 Parent(s): fd59336

Training in progress, step 95000, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "down_proj",
24
  "up_proj",
25
- "q_proj",
26
  "k_proj",
27
  "gate_proj",
 
 
28
  "v_proj",
29
- "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "up_proj",
 
24
  "k_proj",
25
  "gate_proj",
26
+ "down_proj",
27
+ "o_proj",
28
  "v_proj",
29
+ "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:588a15f7cea8d0a814b43f40bdcbefaf553e3f1ea3fe8f93ab138197ad9ca78c
3
  size 5544997664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c8c2bc861c786b2bf9d8ddb8858babedad8fc42c6e26fb00fe13b35096c6de7
3
  size 5544997664
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ebee6fad5e5c1bb9d7e488846379921805f8c0c20003d8a91f2e25aed77a83a
3
  size 674093138
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36ebe5440e2bcb412e4131df2efca8e8fc88b5200168c85a419cb901604336b6
3
  size 674093138
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d8f8f9743ec8d3f95f6d09874e8c8e1665b1753c549b2fad6b80c9e2a59f8a6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0610f4aa7ed2f34398fce8dc77c3d7b14d52dfb0bc17dc7f64e8f6c2438e189b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:def38347a8476c414f9a59b485e01231d01480373c9bf5d7882acb65a1218490
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17feec1222485652df46ab05d04d0cb1b6896f1f053ea3ae8ca19c7cd689e6b7
3
  size 1064
last-checkpoint/tokenizer_config.json CHANGED
@@ -2072,7 +2072,6 @@
2072
  "bos_token": "<|im_start|>",
2073
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2074
  "clean_up_tokenization_spaces": true,
2075
- "device_map": "auto",
2076
  "eos_token": "<|im_end|>",
2077
  "max_length": 4096,
2078
  "model_input_names": [
 
2072
  "bos_token": "<|im_start|>",
2073
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
2074
  "clean_up_tokenization_spaces": true,
 
2075
  "eos_token": "<|im_end|>",
2076
  "max_length": 4096,
2077
  "model_input_names": [
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.06611336694777863,
5
  "eval_steps": 200,
6
- "global_step": 88900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6230,6 +6230,433 @@
6230
  "learning_rate": 1.9991374234676826e-05,
6231
  "loss": 1.5551,
6232
  "step": 88900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6233
  }
6234
  ],
6235
  "logging_steps": 100,
@@ -6249,7 +6676,7 @@
6249
  "attributes": {}
6250
  }
6251
  },
6252
- "total_flos": 1.211678461812007e+18,
6253
  "train_batch_size": 1,
6254
  "trial_name": null,
6255
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.07064982969672631,
5
  "eval_steps": 200,
6
+ "global_step": 95000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6230
  "learning_rate": 1.9991374234676826e-05,
6231
  "loss": 1.5551,
6232
  "step": 88900
6233
+ },
6234
+ {
6235
+ "epoch": 0.06618773518956465,
6236
+ "grad_norm": 0.6733551621437073,
6237
+ "learning_rate": 1.999135481992186e-05,
6238
+ "loss": 1.4334,
6239
+ "step": 89000
6240
+ },
6241
+ {
6242
+ "epoch": 0.06626210343135068,
6243
+ "grad_norm": 0.8035016059875488,
6244
+ "learning_rate": 1.999133538335166e-05,
6245
+ "loss": 1.4872,
6246
+ "step": 89100
6247
+ },
6248
+ {
6249
+ "epoch": 0.06633647167313671,
6250
+ "grad_norm": 0.4339046776294708,
6251
+ "learning_rate": 1.9991315924966277e-05,
6252
+ "loss": 1.4869,
6253
+ "step": 89200
6254
+ },
6255
+ {
6256
+ "epoch": 0.06641083991492273,
6257
+ "grad_norm": 0.6680594086647034,
6258
+ "learning_rate": 1.9991296444765747e-05,
6259
+ "loss": 1.5103,
6260
+ "step": 89300
6261
+ },
6262
+ {
6263
+ "epoch": 0.06648520815670876,
6264
+ "grad_norm": 0.697487473487854,
6265
+ "learning_rate": 1.9991276942750117e-05,
6266
+ "loss": 1.4239,
6267
+ "step": 89400
6268
+ },
6269
+ {
6270
+ "epoch": 0.06655957639849479,
6271
+ "grad_norm": 0.587734043598175,
6272
+ "learning_rate": 1.9991257418919424e-05,
6273
+ "loss": 1.5856,
6274
+ "step": 89500
6275
+ },
6276
+ {
6277
+ "epoch": 0.06663394464028081,
6278
+ "grad_norm": 0.8574571013450623,
6279
+ "learning_rate": 1.999123787327372e-05,
6280
+ "loss": 1.4818,
6281
+ "step": 89600
6282
+ },
6283
+ {
6284
+ "epoch": 0.06670831288206684,
6285
+ "grad_norm": 1.0861676931381226,
6286
+ "learning_rate": 1.9991218305813035e-05,
6287
+ "loss": 1.4883,
6288
+ "step": 89700
6289
+ },
6290
+ {
6291
+ "epoch": 0.06678268112385287,
6292
+ "grad_norm": 1.0139306783676147,
6293
+ "learning_rate": 1.9991198716537422e-05,
6294
+ "loss": 1.5099,
6295
+ "step": 89800
6296
+ },
6297
+ {
6298
+ "epoch": 0.0668570493656389,
6299
+ "grad_norm": 0.6741511225700378,
6300
+ "learning_rate": 1.999117910544692e-05,
6301
+ "loss": 1.4746,
6302
+ "step": 89900
6303
+ },
6304
+ {
6305
+ "epoch": 0.06693141760742492,
6306
+ "grad_norm": 0.9702801704406738,
6307
+ "learning_rate": 1.999115947254157e-05,
6308
+ "loss": 1.5166,
6309
+ "step": 90000
6310
+ },
6311
+ {
6312
+ "epoch": 0.06700578584921095,
6313
+ "grad_norm": 0.7757803797721863,
6314
+ "learning_rate": 1.9991139817821416e-05,
6315
+ "loss": 1.5031,
6316
+ "step": 90100
6317
+ },
6318
+ {
6319
+ "epoch": 0.06708015409099698,
6320
+ "grad_norm": 0.7200698256492615,
6321
+ "learning_rate": 1.9991120141286502e-05,
6322
+ "loss": 1.5834,
6323
+ "step": 90200
6324
+ },
6325
+ {
6326
+ "epoch": 0.067154522332783,
6327
+ "grad_norm": 0.7415780425071716,
6328
+ "learning_rate": 1.999110044293687e-05,
6329
+ "loss": 1.5689,
6330
+ "step": 90300
6331
+ },
6332
+ {
6333
+ "epoch": 0.06722889057456903,
6334
+ "grad_norm": 0.5777677297592163,
6335
+ "learning_rate": 1.9991080722772564e-05,
6336
+ "loss": 1.5139,
6337
+ "step": 90400
6338
+ },
6339
+ {
6340
+ "epoch": 0.06730325881635506,
6341
+ "grad_norm": 0.6991866827011108,
6342
+ "learning_rate": 1.999106098079363e-05,
6343
+ "loss": 1.5073,
6344
+ "step": 90500
6345
+ },
6346
+ {
6347
+ "epoch": 0.06737762705814108,
6348
+ "grad_norm": 0.6112390160560608,
6349
+ "learning_rate": 1.9991041217000105e-05,
6350
+ "loss": 1.4773,
6351
+ "step": 90600
6352
+ },
6353
+ {
6354
+ "epoch": 0.06745199529992713,
6355
+ "grad_norm": 0.8287676572799683,
6356
+ "learning_rate": 1.9991021431392033e-05,
6357
+ "loss": 1.5425,
6358
+ "step": 90700
6359
+ },
6360
+ {
6361
+ "epoch": 0.06752636354171315,
6362
+ "grad_norm": 0.8582881689071655,
6363
+ "learning_rate": 1.999100162396946e-05,
6364
+ "loss": 1.5581,
6365
+ "step": 90800
6366
+ },
6367
+ {
6368
+ "epoch": 0.06760073178349918,
6369
+ "grad_norm": 0.5585276484489441,
6370
+ "learning_rate": 1.999098179473243e-05,
6371
+ "loss": 1.5015,
6372
+ "step": 90900
6373
+ },
6374
+ {
6375
+ "epoch": 0.0676751000252852,
6376
+ "grad_norm": 0.4237435460090637,
6377
+ "learning_rate": 1.9990961943680984e-05,
6378
+ "loss": 1.523,
6379
+ "step": 91000
6380
+ },
6381
+ {
6382
+ "epoch": 0.06774946826707123,
6383
+ "grad_norm": 0.5455594658851624,
6384
+ "learning_rate": 1.999094207081517e-05,
6385
+ "loss": 1.5448,
6386
+ "step": 91100
6387
+ },
6388
+ {
6389
+ "epoch": 0.06782383650885726,
6390
+ "grad_norm": 0.48855817317962646,
6391
+ "learning_rate": 1.999092217613502e-05,
6392
+ "loss": 1.4535,
6393
+ "step": 91200
6394
+ },
6395
+ {
6396
+ "epoch": 0.06789820475064329,
6397
+ "grad_norm": 0.5199916958808899,
6398
+ "learning_rate": 1.999090225964059e-05,
6399
+ "loss": 1.4921,
6400
+ "step": 91300
6401
+ },
6402
+ {
6403
+ "epoch": 0.06797257299242931,
6404
+ "grad_norm": 0.5790271162986755,
6405
+ "learning_rate": 1.9990882321331916e-05,
6406
+ "loss": 1.5773,
6407
+ "step": 91400
6408
+ },
6409
+ {
6410
+ "epoch": 0.06804694123421534,
6411
+ "grad_norm": 0.5524342656135559,
6412
+ "learning_rate": 1.9990862361209043e-05,
6413
+ "loss": 1.4619,
6414
+ "step": 91500
6415
+ },
6416
+ {
6417
+ "epoch": 0.06812130947600137,
6418
+ "grad_norm": 0.7153291702270508,
6419
+ "learning_rate": 1.999084237927202e-05,
6420
+ "loss": 1.6042,
6421
+ "step": 91600
6422
+ },
6423
+ {
6424
+ "epoch": 0.0681956777177874,
6425
+ "grad_norm": 0.957635223865509,
6426
+ "learning_rate": 1.9990822375520882e-05,
6427
+ "loss": 1.538,
6428
+ "step": 91700
6429
+ },
6430
+ {
6431
+ "epoch": 0.06827004595957342,
6432
+ "grad_norm": 0.38240477442741394,
6433
+ "learning_rate": 1.9990802349955678e-05,
6434
+ "loss": 1.5937,
6435
+ "step": 91800
6436
+ },
6437
+ {
6438
+ "epoch": 0.06834441420135945,
6439
+ "grad_norm": 0.8961233496665955,
6440
+ "learning_rate": 1.999078230257645e-05,
6441
+ "loss": 1.5119,
6442
+ "step": 91900
6443
+ },
6444
+ {
6445
+ "epoch": 0.06841878244314548,
6446
+ "grad_norm": 0.47433900833129883,
6447
+ "learning_rate": 1.999076223338324e-05,
6448
+ "loss": 1.5449,
6449
+ "step": 92000
6450
+ },
6451
+ {
6452
+ "epoch": 0.0684931506849315,
6453
+ "grad_norm": 0.8222399353981018,
6454
+ "learning_rate": 1.9990742142376098e-05,
6455
+ "loss": 1.5334,
6456
+ "step": 92100
6457
+ },
6458
+ {
6459
+ "epoch": 0.06856751892671753,
6460
+ "grad_norm": 0.464373916387558,
6461
+ "learning_rate": 1.999072202955506e-05,
6462
+ "loss": 1.5003,
6463
+ "step": 92200
6464
+ },
6465
+ {
6466
+ "epoch": 0.06864188716850356,
6467
+ "grad_norm": 0.8799763321876526,
6468
+ "learning_rate": 1.9990701894920176e-05,
6469
+ "loss": 1.581,
6470
+ "step": 92300
6471
+ },
6472
+ {
6473
+ "epoch": 0.06871625541028958,
6474
+ "grad_norm": 0.9567086100578308,
6475
+ "learning_rate": 1.999068173847149e-05,
6476
+ "loss": 1.4373,
6477
+ "step": 92400
6478
+ },
6479
+ {
6480
+ "epoch": 0.06879062365207561,
6481
+ "grad_norm": 0.440479576587677,
6482
+ "learning_rate": 1.999066156020904e-05,
6483
+ "loss": 1.5571,
6484
+ "step": 92500
6485
+ },
6486
+ {
6487
+ "epoch": 0.06886499189386165,
6488
+ "grad_norm": 0.7486180663108826,
6489
+ "learning_rate": 1.9990641360132876e-05,
6490
+ "loss": 1.4437,
6491
+ "step": 92600
6492
+ },
6493
+ {
6494
+ "epoch": 0.06893936013564768,
6495
+ "grad_norm": 0.7576742172241211,
6496
+ "learning_rate": 1.9990621138243037e-05,
6497
+ "loss": 1.5306,
6498
+ "step": 92700
6499
+ },
6500
+ {
6501
+ "epoch": 0.0690137283774337,
6502
+ "grad_norm": 0.6755186915397644,
6503
+ "learning_rate": 1.9990600894539574e-05,
6504
+ "loss": 1.5769,
6505
+ "step": 92800
6506
+ },
6507
+ {
6508
+ "epoch": 0.06908809661921973,
6509
+ "grad_norm": 0.6093853712081909,
6510
+ "learning_rate": 1.9990580629022526e-05,
6511
+ "loss": 1.5777,
6512
+ "step": 92900
6513
+ },
6514
+ {
6515
+ "epoch": 0.06916246486100576,
6516
+ "grad_norm": 0.5788242220878601,
6517
+ "learning_rate": 1.9990560341691938e-05,
6518
+ "loss": 1.494,
6519
+ "step": 93000
6520
+ },
6521
+ {
6522
+ "epoch": 0.06923683310279179,
6523
+ "grad_norm": 0.828676700592041,
6524
+ "learning_rate": 1.9990540032547855e-05,
6525
+ "loss": 1.5651,
6526
+ "step": 93100
6527
+ },
6528
+ {
6529
+ "epoch": 0.06931120134457781,
6530
+ "grad_norm": 0.5612863302230835,
6531
+ "learning_rate": 1.9990519701590322e-05,
6532
+ "loss": 1.5584,
6533
+ "step": 93200
6534
+ },
6535
+ {
6536
+ "epoch": 0.06938556958636384,
6537
+ "grad_norm": 0.965107262134552,
6538
+ "learning_rate": 1.999049934881938e-05,
6539
+ "loss": 1.497,
6540
+ "step": 93300
6541
+ },
6542
+ {
6543
+ "epoch": 0.06945993782814987,
6544
+ "grad_norm": 0.46939852833747864,
6545
+ "learning_rate": 1.9990478974235078e-05,
6546
+ "loss": 1.5716,
6547
+ "step": 93400
6548
+ },
6549
+ {
6550
+ "epoch": 0.0695343060699359,
6551
+ "grad_norm": 0.4986964464187622,
6552
+ "learning_rate": 1.999045857783746e-05,
6553
+ "loss": 1.5762,
6554
+ "step": 93500
6555
+ },
6556
+ {
6557
+ "epoch": 0.06960867431172192,
6558
+ "grad_norm": 0.4267128109931946,
6559
+ "learning_rate": 1.9990438159626566e-05,
6560
+ "loss": 1.5101,
6561
+ "step": 93600
6562
+ },
6563
+ {
6564
+ "epoch": 0.06968304255350795,
6565
+ "grad_norm": 0.411811888217926,
6566
+ "learning_rate": 1.9990417719602445e-05,
6567
+ "loss": 1.5623,
6568
+ "step": 93700
6569
+ },
6570
+ {
6571
+ "epoch": 0.06975741079529398,
6572
+ "grad_norm": 0.8761053681373596,
6573
+ "learning_rate": 1.999039725776514e-05,
6574
+ "loss": 1.4294,
6575
+ "step": 93800
6576
+ },
6577
+ {
6578
+ "epoch": 0.06983177903708,
6579
+ "grad_norm": 0.9531000852584839,
6580
+ "learning_rate": 1.99903767741147e-05,
6581
+ "loss": 1.4925,
6582
+ "step": 93900
6583
+ },
6584
+ {
6585
+ "epoch": 0.06990614727886603,
6586
+ "grad_norm": 0.516830325126648,
6587
+ "learning_rate": 1.999035626865116e-05,
6588
+ "loss": 1.5802,
6589
+ "step": 94000
6590
+ },
6591
+ {
6592
+ "epoch": 0.06998051552065206,
6593
+ "grad_norm": 0.47061294317245483,
6594
+ "learning_rate": 1.9990335741374572e-05,
6595
+ "loss": 1.5668,
6596
+ "step": 94100
6597
+ },
6598
+ {
6599
+ "epoch": 0.07005488376243808,
6600
+ "grad_norm": 0.7790777683258057,
6601
+ "learning_rate": 1.9990315192284978e-05,
6602
+ "loss": 1.5568,
6603
+ "step": 94200
6604
+ },
6605
+ {
6606
+ "epoch": 0.07012925200422411,
6607
+ "grad_norm": 0.75156170129776,
6608
+ "learning_rate": 1.9990294621382426e-05,
6609
+ "loss": 1.5217,
6610
+ "step": 94300
6611
+ },
6612
+ {
6613
+ "epoch": 0.07020362024601014,
6614
+ "grad_norm": 1.195028305053711,
6615
+ "learning_rate": 1.999027402866696e-05,
6616
+ "loss": 1.5662,
6617
+ "step": 94400
6618
+ },
6619
+ {
6620
+ "epoch": 0.07027798848779618,
6621
+ "grad_norm": 0.6215851306915283,
6622
+ "learning_rate": 1.999025341413862e-05,
6623
+ "loss": 1.5208,
6624
+ "step": 94500
6625
+ },
6626
+ {
6627
+ "epoch": 0.0703523567295822,
6628
+ "grad_norm": 0.509843647480011,
6629
+ "learning_rate": 1.9990232777797458e-05,
6630
+ "loss": 1.489,
6631
+ "step": 94600
6632
+ },
6633
+ {
6634
+ "epoch": 0.07042672497136823,
6635
+ "grad_norm": 1.2951029539108276,
6636
+ "learning_rate": 1.9990212119643516e-05,
6637
+ "loss": 1.4729,
6638
+ "step": 94700
6639
+ },
6640
+ {
6641
+ "epoch": 0.07050109321315426,
6642
+ "grad_norm": 0.5028135776519775,
6643
+ "learning_rate": 1.9990191439676838e-05,
6644
+ "loss": 1.5579,
6645
+ "step": 94800
6646
+ },
6647
+ {
6648
+ "epoch": 0.07057546145494029,
6649
+ "grad_norm": 0.7202877998352051,
6650
+ "learning_rate": 1.9990170737897473e-05,
6651
+ "loss": 1.5282,
6652
+ "step": 94900
6653
+ },
6654
+ {
6655
+ "epoch": 0.07064982969672631,
6656
+ "grad_norm": 0.9731516242027283,
6657
+ "learning_rate": 1.9990150014305462e-05,
6658
+ "loss": 1.5194,
6659
+ "step": 95000
6660
  }
6661
  ],
6662
  "logging_steps": 100,
 
6676
  "attributes": {}
6677
  }
6678
  },
6679
+ "total_flos": 1.2945898144897352e+18,
6680
  "train_batch_size": 1,
6681
  "trial_name": null,
6682
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bb89e5e4b20648cd50836d8df065bde229d29cb5c6085310a18725c84aab824
3
- size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccc6594b62fe53f0b1bfeab5cb36a3d9d52c3d027d521d24a54039f0b55f3bd6
3
+ size 5560