Muennighoff committed
Commit f4bcf01 · Parent(s): f9fc05c
This view is limited to 50 files because it contains too many changes. See raw diff.
- 146m174b400m/3326770.err +0 -0
- 146m174b400m/3326770.out +367 -0
- 146m174b400m/3425869.err +0 -0
- 146m174b400m/3425869.out +0 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- 146m174b400m/global_step331103/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3 -0
146m174b400m/3326770.err
ADDED
The diff for this file is too large to render. See raw diff.
146m174b400m/3326770.out
ADDED
@@ -0,0 +1,367 @@
+Model parameters: d_model 768 ffw_size 3072 kv_size 64 n_heads 12 n_layers 15
+Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 15 --hidden-size 768 --num-attention-heads 12 --kv-channels 64 --ffn-hidden-size 3072 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 84_762_549 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-146m174b400m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 84_762_549 --lr-warmup-samples 847_625 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 100 --save-interval 10000 --eval-interval 10000 --eval-iters 1 --tensorboard-dir tensorboard_146m174b400m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_146m174b400m --load checkpoints_146m174b400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3326770.json --zero-stage 0
+START 3326770: Fri 17 Mar 2023 01:18:59 PM EET
+0:
+0:
+0: ======================= ROCm System Management Interface =======================
+0: ================================= Concise Info =================================
+0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+0: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+0: 2 37.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+0: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+0: 4 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+0: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+0: 6 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+0: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+0: ================================================================================
+0: ============================= End of ROCm SMI Log ==============================
+7:
+7:
+7: ======================= ROCm System Management Interface =======================
+7: ================================= Concise Info =================================
+7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+7: 0 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+7: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+7: 2 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+7: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+7: 4 43.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+7: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+7: 6 37.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+7: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+7: ================================================================================
+7: ============================= End of ROCm SMI Log ==============================
+1:
+1:
+1: ======================= ROCm System Management Interface =======================
+1: ================================= Concise Info =================================
+1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+1: 0 50.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+1: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+1: 2 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+1: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+1: 4 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+1: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+1: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+1: ================================================================================
+1: ============================= End of ROCm SMI Log ==============================
+2:
+2:
+2: ======================= ROCm System Management Interface =======================
+2: ================================= Concise Info =================================
+2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+2: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+2: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+2: 2 46.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+2: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+2: 4 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+2: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+2: 6 39.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+2: ================================================================================
+2: ============================= End of ROCm SMI Log ==============================
+6:
+6:
+6: ======================= ROCm System Management Interface =======================
+6: ================================= Concise Info =================================
+6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+6: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+6: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+6: 2 38.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+6: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+6: 4 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+6: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+6: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+6: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+6: ================================================================================
+6: ============================= End of ROCm SMI Log ==============================
+3:
+3:
+3: ======================= ROCm System Management Interface =======================
+3: ================================= Concise Info =================================
+3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+3: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+3: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+3: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+3: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+3: 4 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+3: 5 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+3: 6 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+3: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+3: ================================================================================
+3: ============================= End of ROCm SMI Log ==============================
+5:
+5:
+5: ======================= ROCm System Management Interface =======================
+5: ================================= Concise Info =================================
+5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+5: 0 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+5: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+5: 2 36.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+5: 4 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+5: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+5: 6 37.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+5: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+5: ================================================================================
+5: ============================= End of ROCm SMI Log ==============================
+4:
+4:
+4: ======================= ROCm System Management Interface =======================
+4: ================================= Concise Info =================================
+4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+4: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+4: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+4: 2 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+4: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+4: 4 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+4: 6 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0%
+4: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0%
+4: ================================================================================
+4: ============================= End of ROCm SMI Log ==============================
+6: Launching on nid005626 (6/8), master nid005620 port 9999, GPUs 8, CUDA: True
+5: Launching on nid005625 (5/8), master nid005620 port 9999, GPUs 8, CUDA: True
+4: Launching on nid005624 (4/8), master nid005620 port 9999, GPUs 8, CUDA: True
+3: Launching on nid005623 (3/8), master nid005620 port 9999, GPUs 8, CUDA: True
+2: Launching on nid005622 (2/8), master nid005620 port 9999, GPUs 8, CUDA: True
+7: Launching on nid005627 (7/8), master nid005620 port 9999, GPUs 8, CUDA: True
+1: Launching on nid005621 (1/8), master nid005620 port 9999, GPUs 8, CUDA: True
+0: Launching on nid005620 (0/8), master nid005620 port 9999, GPUs 8, CUDA: True
+7: > setting tensorboard ...
+0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
+0: accumulate and all-reduce gradients in fp32 for bfloat16 data type.
+0: using torch.bfloat16 for parameters ...
+0: ------------------------ arguments ------------------------
+0: abort_on_unmet_fused_kernel_constraints ......... False
+0: accumulate_allreduce_grads_in_fp32 .............. True
+0: adam_beta1 ...................................... 0.9
+0: adam_beta2 ...................................... 0.999
+0: adam_eps ........................................ 1e-08
+0: adlr_autoresume ................................. False
+0: adlr_autoresume_interval ........................ 1000
+0: apply_query_key_layer_scaling ................... True
+0: apply_residual_connection_post_layernorm ........ False
+0: attention_dropout ............................... 0.1
+0: attention_softmax_in_fp32 ....................... False
+0: bert_binary_head ................................ True
+0: bert_load ....................................... None
+0: bf16 ............................................ True
+0: bias_dropout_fusion ............................. True
+0: bias_gelu_fusion ................................ True
+0: biencoder_projection_dim ........................ 0
+0: biencoder_shared_query_context_model ............ False
+0: block_data_path ................................. None
+0: checkpoint_activations .......................... True
+0: checkpoint_in_cpu ............................... False
+0: checkpoint_num_layers ........................... 1
+0: clip_grad ....................................... 1.0
+0: codecarbon_dir .................................. None
+0: consumed_train_samples .......................... 0
+0: consumed_train_tokens ........................... 0
+0: consumed_valid_samples .......................... 0
+0: contigious_checkpointing ........................ False
+0: cpu_optimizer ................................... False
+0: cpu_torch_adam .................................. False
+0: curriculum_learning ............................. False
+0: data_impl ....................................... mmap
+0: data_parallel_size .............................. 64
+0: data_path ....................................... None
+0: dataloader_type ................................. single
+0: DDP_impl ........................................ local
+0: decoder_seq_length .............................. None
+0: deepscale ....................................... False
+0: deepscale_config ................................ None
+0: deepspeed ....................................... True
+0: deepspeed_activation_checkpointing .............. False
+0: deepspeed_config ................................ ds_configs/3326770.json
+0: deepspeed_mpi ................................... False
+0: distribute_checkpointed_activations ............. False
+0: distributed_backend ............................. nccl
+0: embed_layernorm ................................. False
+0: embedding_path .................................. None
+0: encoder_seq_length .............................. 2048
+0: eod_mask_loss ................................... False
+0: eval_interval ................................... 10000
+0: eval_iters ...................................... 1
+0: eval_only ....................................... None
+0: evidence_data_path .............................. None
+0: exit_duration_in_mins ........................... None
+0: exit_interval ................................... None
+0: ffn_hidden_size ................................. 3072
+0: finetune ........................................ False
+0: fp16 ............................................ False
+0: fp16_lm_cross_entropy ........................... False
+0: fp32_residual_connection ........................ False
+0: gigaflos_no_embeds .............................. 0
+0: global_batch_size ............................... 256
+0: glu_activation .................................. None
+0: hidden_dropout .................................. 0.1
+0: hidden_size ..................................... 768
+0: hysteresis ...................................... 2
+0: ict_head_size ................................... None
+0: ict_load ........................................ None
+0: img_dim ......................................... 224
+0: indexer_batch_size .............................. 128
+0: indexer_log_interval ............................ 1000
+0: inference ....................................... False
+0: init_method_std ................................. 0.02
+0: init_method_xavier_uniform ...................... False
+0: initial_loss_scale .............................. 4294967296
+0: kill_switch_path ................................ kill-switch-146m174b400m
+0: kv_channels ..................................... 64
+0: layer_norm_fusion ............................... True
+0: layernorm_epsilon ............................... 1e-05
+0: lazy_mpu_init ................................... None
+0: load ............................................ checkpoints_146m174b400m
+0: local_rank ...................................... None
+0: log_batch_size_to_tensorboard ................... True
+0: log_interval .................................... 100
+0: log_learning_rate_to_tensorboard ................ True
+0: log_level ....................................... None
+0: log_level_replica ............................... None
+0: log_loss_scale_to_tensorboard ................... True
+0: log_num_zeros_in_grad ........................... False
+0: log_params_norm ................................. False
+0: log_path ........................................ None
+0: log_timers_to_tensorboard ....................... True
+0: log_validation_ppl_to_tensorboard ............... True
+0: loss_on_targets_only ............................ False
+0: loss_scale ...................................... 12.0
+0: loss_scale_window ............................... 1000
+0: lr .............................................. 0.0002
+0: lr_decay_iters .................................. None
+0: lr_decay_samples ................................ 84762549
+0: lr_decay_style .................................. cosine
+0: lr_decay_tokens ................................. None
+0: lr_warmup_fraction .............................. None
+0: lr_warmup_iters ................................. 0
+0: lr_warmup_samples ............................... 847625
+0: make_vocab_size_divisible_by .................... 128
+0: mask_prob ....................................... 0.15
+0: masked_softmax_fusion ........................... True
+0: max_position_embeddings ......................... 2048
+0: mean_noise_span_length .......................... None
+0: memory_centric_tiled_linear ..................... False
+0: merge_file ...................................... gpt2/merges.txt
+0: micro_batch_size ................................ 4
+0: min_loss_scale .................................. 1.0
+0: min_lr .......................................... 2e-05
+0: mmap_warmup ..................................... False
+0: no_load_optim ................................... None
+0: no_load_rng ..................................... None
+0: no_save_optim ................................... None
+0: no_save_rng ..................................... None
+0: noise_density ................................... None
+0: num_attention_heads ............................. 12
+0: num_channels .................................... 3
+0: num_classes ..................................... 1000
+0: num_layers ...................................... 15
+0: num_layers_per_virtual_pipeline_stage ........... None
+0: num_workers ..................................... 2
+0: onnx_safe ....................................... None
+0: openai_gelu ..................................... False
+0: optimizer ....................................... adam
+0: optimizer_fusion ................................ True
+0: override_lr_scheduler ........................... False
+0: pad_vocab_size_to ............................... None
+0: params_dtype .................................... torch.bfloat16
+0: partition_activations ........................... False
+0: patch_dim ....................................... 16
+0: pipeline_model_parallel_size .................... 1
+0: position_embedding_type ......................... PositionEmbeddingType.absolute
+0: pp_partition_method ............................. None
+0: profile_backward ................................ False
+0: query_in_block_prob ............................. 0.1
+0: rampup_batch_size ............................... None
+0: rank ............................................ 0
+0: remote_device ................................... none
+0: reset_attention_mask ............................ False
+0: reset_position_ids .............................. False
+0: reset_progress .................................. None
+0: retriever_report_topk_accuracies ................ []
+0: retriever_score_scaling ......................... False
+0: retriever_seq_length ............................ 256
+0: reweight_loss_based_on_position_frequency ....... False
+0: sample_rate ..................................... 1.0
+0: save ............................................ checkpoints_146m174b400m
+0: save_interval ................................... 10000
+0: scatter_gather_tensors_in_pipeline .............. True
+0: scattered_embeddings ............................ False
+0: seed ............................................ 1234
+0: seq_length ...................................... 2048
+0: sgd_momentum .................................... 0.9
+0: short_seq_prob .................................. 0.1
+0: skip_train_iteration_range ...................... None
+0: split ........................................... None
+0: split_transformers .............................. False
+0: sync_tp_duplicated_parameters ................... False
+0: synchronize_each_layer .......................... False
+0: tensor_model_parallel_size ...................... 1
+0: tensorboard_dir ................................. tensorboard_146m174b400m
+0: tensorboard_log_interval ........................ 1
+0: tensorboard_queue_size .......................... 5
+0: test_weighted_split_paths ....................... None
+0: test_weighted_split_paths_path .................. None
+0: tile_factor ..................................... 1
+0: titles_data_path ................................ None
+0: tokenizer_name_or_path .......................... None
+0: tokenizer_type .................................. GPT2BPETokenizer
+0: train_iters ..................................... None
+0: train_samples ................................... 84762549
+0: train_tokens .................................... None
+0: train_weighted_split_names ...................... ['train']
+0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']]
+0: train_weighted_split_paths_path ................. None
+0: train_weighted_split_splits ..................... [['0:1']]
+0: train_weighted_split_weights .................... [['1.0']]
+0: universal_checkpoint ............................ False
+0: use_bnb_optimizer ............................... False
+0: use_checkpoint_lr_scheduler ..................... False
+0: use_contiguous_buffers_in_ddp ................... True
+0: use_cpu_initialization .......................... None
+0: use_one_sent_docs ............................... False
+0: use_pin_memory .................................. False
+0: valid_num_workers ............................... 2
+0: valid_weighted_split_names ...................... ['validation']
+0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']]
+0: valid_weighted_split_paths_path ................. None
+0: valid_weighted_split_splits ..................... [['0:1']]
+0: valid_weighted_split_weights .................... [['1.0']]
+0: virtual_pipeline_model_parallel_size ............ None
+0: vocab_extra_ids ................................. 0
+0: vocab_file ...................................... gpt2/vocab.json
+0: weight_decay .................................... 0.1
+0: world_size ...................................... 64
+0: zero_allgather_bucket_size ...................... 0.0
+0: zero_contigious_gradients ....................... False
+0: zero_reduce_bucket_size ......................... 0.0
+0: zero_reduce_scatter ............................. False
+0: zero_stage ...................................... 0
+0: -------------------- end of arguments ---------------------
+0: setting number of micro-batches to constant 1
+0: > building GPT2BPETokenizer tokenizer ...
+0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
+0: DeepSpeed general environment info:
+0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch']
+0: torch version .................... 1.13.0+rocm5.2
+0: torch cuda version ............... None
+0: torch hip version ................ 5.2.21151-afdc89f8
+0: nvcc version ..................... None
+0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed']
+0: deepspeed info ................... 0.7.5, unknown, unknown
+0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1
+0: **** Git info for Megatron: git_hash=unknown git_branch=unknown ****
+0: > initializing torch distributed ...
+0: [2023-03-17 13:22:27,336] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+0: > initializing tensor model parallel with size 1
+0: > initializing pipeline model parallel with size 1
+0: > setting random seeds to 1234 ...
+0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
+0: > compiling dataset index builder ...
+0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
+0: make: Nothing to be done for 'default'.
+0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data'
+0: >>> done with dataset index builder. Compilation time: 0.071 seconds
+0: > compiling and loading fused kernels ...
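As a quick sanity check, the hyperparameters logged above (d_model 768, ffn 3072, 15 layers, padded vocab 50304, 2048-token sequences, 84,762,549 training samples) are consistent with roughly 146M parameters and ~174B training tokens, matching the run name 146m174b400m. The sketch below is an illustrative back-of-the-envelope count using standard GPT-2-style bookkeeping; it is not a script from this repository.

    # Rough parameter count from the arguments logged above. Assumptions: tied
    # output embedding, learned absolute position embeddings, and biases on all
    # linear layers (GPT-2-style), as the config suggests.
    def gpt2_style_param_count(d_model, n_layers, ffn_hidden, vocab_size, seq_len):
        embeddings = vocab_size * d_model + seq_len * d_model          # token + position tables
        attention  = 4 * d_model * d_model + 4 * d_model               # QKV + output projection (+ biases)
        mlp        = 2 * d_model * ffn_hidden + ffn_hidden + d_model   # up/down projections (+ biases)
        layernorms = 2 * 2 * d_model                                   # two LayerNorms per block
        return embeddings + n_layers * (attention + mlp + layernorms) + 2 * d_model  # + final LayerNorm

    total = gpt2_style_param_count(768, 15, 3072, 50304, 2048)
    print(f"~{total / 1e6:.1f}M parameters")          # ~146.5M, matching "146m" in the run name
    print(f"~{84_762_549 * 2048 / 1e9:.0f}B tokens")  # ~174B, matching "174b" in the run name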
146m174b400m/3425869.err
ADDED
The diff for this file is too large to render. See raw diff.
146m174b400m/3425869.out
ADDED
The diff for this file is too large to render. See raw diff.
146m174b400m/global_step331103/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d87a5ec7d1a728050c334ba1c1159f4b52490a0b29550ef0a4e55e8958eb6771
+size 27478295
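The checkpoint shards in this commit are stored as Git LFS pointer files like the one above: a version line, a sha256 oid, and a size in bytes. Below is a minimal illustrative sketch, assuming a local clone where the pointers have not been smudged, for reading the oid and size out of such a file; it is not part of the training code.

    # Minimal Git LFS pointer reader (sketch). A pointer file has three lines:
    #   version https://git-lfs.github.com/spec/v1
    #   oid sha256:<hex digest>
    #   size <bytes>
    def read_lfs_pointer(path):
        fields = {}
        with open(path) as f:
            for line in f:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        assert fields.get("version") == "https://git-lfs.github.com/spec/v1"
        return fields["oid"].removeprefix("sha256:"), int(fields["size"])

    # Hypothetical usage on a clone of this repo:
    # oid, size = read_lfs_pointer(
    #     "146m174b400m/global_step331103/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt")
    # print(oid, size)  # d87a5ec7..., 27478295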
146m174b400m/global_step331103/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15844728dc8299c1039742957b30eea7b7756a1d66024be6e4d877c584fce21f
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6902ea9c925405dcc3d261cefbd4397b69a963ca8f8e86533807733fa00d7d05
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:928debe253186a0428d324ab1150d9498e683831d22a7b0551e176e41a776fe1
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67b285f7cd611c593202f82959f089945b63682c9610329dd18125d3533d61de
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb8b8c9fc41f6f08a293a421655c107a33d954175a4d8757c35fd69b2bf4c12
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c57c9b63e444b5efce4b829b28d63849f7b435c3ac8e361169480ac56d044c35
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d11219ca0006e1e4849b0c6891adc4e006d1a9dba607a95ce03b78868e69d13
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8398b6b2c7f9030d255a066f4f384a4c230ecc6ac9e628f80ebcab2268727784
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db8ab8089dfb48beb172d2c269b52e8846ecd9dfaa5ce387a2cc7afcfa033f67
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a95e485c8da49a87a8deacbc6b13629225e92ac4e3acbb3252e4028680e7bce8
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e71ce3eaf07b24477bf0736005813904a70cc83b773b5bc26a668d5b8be5249f
+size 27478231
146m174b400m/global_step331103/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad3ada667fa57759ca658d181f2733a6791886710458f85d7e8ced8ff832ce1c
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff7fb668c4c79ce396ff830fa368d00d4df4fe97bf36c529dba3b146cce4e089
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15329c4cec48aaa5ae3927c9e06ba31c0defb9c1cbbe9ffc14f2a0e3c00176d6
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cfbd63f8cdaa1e5cfea96523db07cda76bfd278f1d18ed291f18a73585eff7e
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e322d625d8d9a866268bf8f15b8cf41a88ad90011b9b20fab6925ee5dd610da4
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d3c39caa86bad034801f02ebabb15a89dc3fd0f076c77a0b534645f90c9b8ec
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfa53875473b3e80a003715bc690a2a055635d3b8be5be34c7b43d5e6bffe0e8
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d0c93c8744985186e907a4c6ea9f64fc0ea910effc377305400c63c6e3b538e
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d3f9dbc62b19c6647ef5128bfdab5c874f6fb423d8059035756e752464a93d
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90fc7b3ac1e83456e74c97c2ab72ee9468bce84c6f5e8ac39653d4bbf13bf03d
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc9a03a1572f4f2f2c8efea5128b48408fca3500cb2c2793774ce70911706fd2
+size 27478231
146m174b400m/global_step331103/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e94d750d6b32a697203eccb1cf5de56fca2480f00457a4a3da94d31d16919af
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:963600de746f54e8a53931d40e6cf30c0b3624c03f6d7ac8e8c24dbf24a86c4a
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac3985e94efd0442dd00e6f0cd1f821b9f696bb097251914fb56166b5049eac0
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9858a0e0cd22031d2c7591d2c841aa32202756e0effa7ff8bc537957d9ea6f92
+size 27478434
146m174b400m/global_step331103/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:223209fc322c276405e908e1c8845f3cb97eeaa078a4bfdeb9ebd27e8e2d287a
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab234497bc3e9c2dc8d327e0ba90f641509aace4bfc7ac08a79033094793a1af
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ffc82bc1224e82b6a78439dd2555d5f88088f82eff281f2fc356bd696c3e396
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b8ce2fe4bf408924acc58a4b9d89854ab51743413b39f18a40ced709c95d75
+size 27478114
146m174b400m/global_step331103/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ec90772020ebf7196bdfaa6837622b4c4c07fab6a7f3cae7971d93830151276
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1e9352b4ef4c7b1f46369832276d8ae8dffbcabda8483f0fff1fa741c7293f8
+size 27478434
146m174b400m/global_step331103/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9facd2d68e99e551262e6faa8828a10dc512282f2bb040f1137f51f1673dce6
+size 27478231
146m174b400m/global_step331103/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4421591b14d517fe92e494285d711657d2cf579ce1f63e80821ee07884610759
+size 27478242
146m174b400m/global_step331103/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec976f0c60e3b19ee2d9d8ddbfc72aa28bb30d022c6eb52b4b3708e2ea264ecb
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0dd981fd2c47d9490108fb4b24b8ba290444d07e8c9082e95a8c0ef95db21c9
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37b92cef051d6fdf02a727bfff86739467aa0870f81ebb3a799c28a3492fce63
+size 27478178
146m174b400m/global_step331103/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52246faec5d7fc04dcac3214a21a80febf6efcfbd6b0b6a4af424ee43758ec30
+size 27478434
146m174b400m/global_step331103/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d74ca676275ee150f4a71de96d2d5a443046410591cf3dd6b53f6519000d727
+size 27478114
146m174b400m/global_step331103/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca21e23fee531d933ad9a82f6f90d64ca5b72a214958e68a557db8a3ed37f393
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b49d064dc2b9395f721ca5a812ef1687a29d5005c43ab3340ac754dc4be7659
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55d4d1eb485b13bdec3a3dc65dee3565434c575d8e906f54e38800abe0eea0dd
+size 27478306
146m174b400m/global_step331103/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12ca2017ada61a90c5edacec63327dcb521871f1674b399e3b0d5e2cc265c6b9
+size 27478370
146m174b400m/global_step331103/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cceb62b1d516ffe44441a0ff102abbeb84b580c5a01a7493f1534631fce21ba
+size 27478231
146m174b400m/global_step331103/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26c1597d2f686e21743ace67b762b65f86903638b00a71bc24e2baea7890d952
+size 27478242