# Paths
model = '/workspace/model'
output_dir = '/workspace/out'

# LoRA configuration
# Set full_fine_tune = true (with no quantization) to train the whole model instead of a LoRA.
#full_fine_tune = true
lora_rank = 16
lora_alpha = 32
lora_dropout = 0.05

# Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
# If not set, all linear modules are adapted.
# Note: this ALSO affects full fine-tuning. In that case, if this is set, only weights containing one
# of these keys as a substring will have requires_grad set. If not set, everything is trained.
#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
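# For example, to adapt only the attention projections (an illustrative subset, not a recommendation):
#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj']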

# can specify layers to adapt with LoRA if you want
#layers_to_transform = '16:31'

# for Mixtral, set the load balancing coefficient
# load_balancing_loss_coef = 0.02

# Optimization configuration
epochs = 2
lr_scheduler = 'cosine'  # can also be 'constant'
warmup_steps = 50

# Useful if resuming from a checkpoint and you want to override the scheduler and force the LR to a specific constant value.
#force_constant_lr = 5e-5

# hard clamp the magnitude of the LoRA weights
#scale_weight_norms = 1.0

# dynamic batch size, targeting this many tokens per batch, per device
# if set, completely ignores the batch size in the deepspeed JSON config file
# can be thought of as a replacement for sample packing
batch_size_tokens = 10000
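# Rough illustration (assumed average lengths, not measured): with batch_size_tokens = 10000 and
# examples averaging ~2000 tokens, each per-device batch holds about 5 examples; longer examples
# mean fewer examples per batch.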

# Performance settings
pipeline_stages = 8  # number of pipeline parallel stages, must evenly divide the number of GPUs you launch the script with
logging_steps = 10  # how often to log in Tensorboard
eval_steps = 500
save_steps = 500
checkpoint_every_n_minutes = 60
eval_before_first_step = false  # do an eval before any training happens
# dtype to load the underlying model weights in
model_weight_dtype = 'bfloat16'
# dtype for the LoRA weights
lora_weight_dtype = 'bfloat16'
# The saved weights can be a different dtype. This doesn't need to be set. Useful, for example,
# when training in float32 but saving in float16.
#save_dtype = 'bfloat16'
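# For example, the float32-train / float16-save case mentioned above (a sketch using the same keys):
#model_weight_dtype = 'float32'
#lora_weight_dtype = 'float32'
#save_dtype = 'float16'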
# Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest
# (this only applies to the current training session, and resumed training sessions will not touch
# old saves)
keep_states = 5

# sort examples by length before dividing them into batches
# this makes all examples in a batch approximately the same length, to minimize padding
# the batches are still shuffled after that
# you should probably always have this set to true
group_by_length = true

# This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
# for a minor performance hit.
# Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
# true: 75s step time, 19.7G peak per-GPU VRAM usage.
# 'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
activation_checkpointing = 'unsloth'

# Keep MLP weights in system RAM until they are needed. Can save a ton of VRAM with a
# moderate hit to performance. If using an MoE model, this can also be an integer, in
# which case only that many experts are offloaded (a tradeoff between VRAM and speed).
offload_mlp_to_cpu = 2
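# For example (a sketch; the boolean form follows from the comment above):
#offload_mlp_to_cpu = true  # offload all MLP weights
#offload_mlp_to_cpu = 8     # MoE: offload only 8 experts (less VRAM saved, but faster than offloading all)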

# Resume a prior run
# if true, we attempt to resume training from the most recent directory inside output_dir (the directory names are timestamps)
# so, to resume, just run the exact same command but set this to true first
resume_from_checkpoint = false

# Loading the optimizer states seems to cause some kind of unavoidable VRAM leak.
# It's very small, only about 0.2 GB in the cases I've seen. But if you are very close to the
# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
# this to skip loading the optimizer states, so that resuming hopefully won't OOM.
#load_optimizer_states = false


# Dataset configuration

# How to combine multiple datasets if you have more than one.
# Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
dataset_combination_mode = 'interleave'
# When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
# Default if not set: 'first_exhausted'
dataset_interleave_stopping_strategy = 'all_exhausted'
# Can be set lower than the training gradient accumulation steps, so fewer examples are dropped
# when forming equal-sized batches. Default if not set: same as the training GAS.
eval_gradient_accumulation_steps = 1

# bitsandbytes 4-bit quantization. The parameters here become arguments to the Transformers BitsAndBytesConfig.
#[quantization.bnb]
#load_in_4bit = true
#bnb_4bit_use_double_quant = false
#bnb_4bit_compute_dtype = 'bfloat16'

# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
# [quantization.hqq]
# nbits = 4
# group_size = 64
# compute_dtype = 'bfloat16'

# (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
# is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
# [quantization.hqq.dynamic_config]
# gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
# up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
# down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}

[optimizer]
# options: adamw_kahan, AdamW, AdamW8bit
type = 'adamw_kahan'
lr = 5e-5
beta1 = 0.9
beta2 = 0.99
weight_decay = 0.1
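
# Example alternative using the 8-bit optimizer (a sketch; assumes the remaining fields apply unchanged):
#[optimizer]
#type = 'AdamW8bit'
#lr = 5e-5
#beta1 = 0.9
#beta2 = 0.99
#weight_decay = 0.1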

[[datasets]]
# Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc if not set.
name = 'c2'
dataset_type = 'axolotl'
dataset_path = '../axolotl/sorc.yml'
sequence_len = 8192
eval_size = 0.01
# Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
sample_weight = 1

#[[datasets]]
#name = 'capybara'
#dataset_type = 'axolotl'
#dataset_path = 'examples/capybara.yml'
#sequence_len = 2048
#eval_size = 0.02
#sample_weight = 1.5

# In addition to using eval_size which splits off some of the dataset, we can have completely separate datasets for eval.
# This can be useful if you're training on raw text data, so that the eval set remains completely fixed, even if
# you change training sequence_len, etc.
# This is just an example; typically you wouldn't have it overlap a training dataset.
# [[eval_datasets]]
# name = 'capybara'
# dataset_type = 'axolotl'
# dataset_path = 'examples/capybara.yml'
# sequence_len = 2048