File size: 2,514 Bytes
6926a80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
    '''
    Build the DeepSpeed ZeRO stage-3 configuration dictionary.

    Parameters are offloaded to NVMe (under ``nvme_offload_dir``) when that
    argument is truthy, and to CPU otherwise. bf16 is enabled according to
    ``ds_bf16`` and fp16 is set to its negation. ``train_batch_size`` is
    passed through unchanged; the micro batch size per GPU is fixed at 1.

    Reference:
    https://huggingface.co/docs/transformers/main_classes/deepspeed
    '''
    # Fragments shared by both offload targets.
    precision = {
        "fp16": {"enabled": not ds_bf16},
        "bf16": {"enabled": ds_bf16},
    }
    stage3_auto = {
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": "auto",
        "stage3_max_reuse_distance": "auto",
    }
    runtime = {
        "steps_per_print": 2000,
        "train_batch_size": train_batch_size,
        "train_micro_batch_size_per_gpu": 1,
        "wall_clock_breakdown": False,
    }

    if not nvme_offload_dir:
        # CPU offload: no NVMe path and no async-I/O section.
        cpu_zero = {
            "stage": 3,
            "offload_param": {"device": "cpu", "pin_memory": True},
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            **stage3_auto,
        }
        return {**precision, "zero_optimization": cpu_zero, **runtime}

    # NVMe offload: adds buffer sizing, a sub-group size and "aio" tuning.
    nvme_zero = {
        "stage": 3,
        "offload_param": {
            "device": "nvme",
            "nvme_path": nvme_offload_dir,
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 1e9,
            "max_in_cpu": 1e9,
        },
        "overlap_comm": True,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": True,
        "sub_group_size": 1e8,
        **stage3_auto,
    }
    async_io = {
        "block_size": 262144,
        "queue_depth": 32,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True,
    }
    return {
        **precision,
        "zero_optimization": nvme_zero,
        "aio": async_io,
        **runtime,
    }