Commit 05a0e7d (1 parent: 2e25161), committed by db
init

Files changed:
- bench.py +117 -0
- config/eval_gpt2.py +8 -0
- config/eval_gpt2_large.py +8 -0
- config/eval_gpt2_medium.py +8 -0
- config/eval_gpt2_xl.py +8 -0
- config/finetune_shakespeare.py +25 -0
- config/train_gpt2.py +25 -0
- config/train_shakespeare_char.py +37 -0
- configurator.py +47 -0
bench.py
ADDED
@@ -0,0 +1,117 @@
"""
A much shorter version of train.py for benchmarking
"""
import os
from contextlib import nullcontext
import numpy as np
import time
import torch
from model import GPTConfig, GPT

# -----------------------------------------------------------------------------
batch_size = 12
block_size = 1024
bias = False
real_data = True
seed = 1337
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
compile = True # use PyTorch 2.0 to compile the model to be faster
profile = False # use pytorch profiler, or just simple benchmarking?
exec(open('configurator.py').read()) # overrides from command line or config file
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# data loading init
if real_data:
    dataset = 'openwebtext'
    data_dir = os.path.join('data', dataset)
    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    def get_batch(split):
        data = train_data # note ignore split in benchmarking script
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        return x, y
else:
    # alternatively, if fixed data is desired to not care about data loading
    x = torch.randint(50304, (batch_size, block_size), device=device)
    y = torch.randint(50304, (batch_size, block_size), device=device)
    get_batch = lambda split: (x, y)

# model init
gptconf = GPTConfig(
    block_size = block_size, # how far back does the model look? i.e. context size
    n_layer = 12, n_head = 12, n_embd = 768, # size of the model
    dropout = 0, # for determinism
    bias = bias,
)
model = GPT(gptconf)
model.to(device)

optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)

if compile:
    print("Compiling model...")
    model = torch.compile(model) # pytorch 2.0

if profile:
    # useful docs on pytorch profiler:
    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
    wait, warmup, active = 5, 5, 5
    num_steps = wait + warmup + active
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
        record_shapes=False,
        profile_memory=False,
        with_stack=False, # incurs an additional overhead, disable if not needed
        with_flops=True,
        with_modules=False, # only for torchscript models atm
    ) as prof:

        X, Y = get_batch('train')
        for k in range(num_steps):
            with ctx:
                logits, loss = model(X, Y)
            X, Y = get_batch('train')
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")

            prof.step() # notify the profiler at end of each step

else:

    # simple benchmarking
    torch.cuda.synchronize()
    for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
        t0 = time.time()
        X, Y = get_batch('train')
        for k in range(num_steps):
            with ctx:
                logits, loss = model(X, Y)
            X, Y = get_batch('train')
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1 - t0
        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
        if stage == 1:
            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
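Note: the final MFU figure comes from model.estimate_mfu, which lives in model.py and is not part of this commit. A minimal sketch of the usual PaLM-style accounting such an estimate is based on, with the A100 bfloat16 peak of 312 TFLOPS assumed as the reference:

# illustrative sketch only: model.estimate_mfu is defined in model.py (not in this commit);
# this reproduces the standard PaLM-style FLOPs accounting it is commonly based on
def estimate_mfu_sketch(n_params, n_layer, n_head, n_embd, block_size,
                        fwdbwd_per_iter, dt, peak_flops=312e12):
    # flops per token: 6*N for the parameter matmuls plus 12*L*H*Q*T for attention
    L, H, Q, T = n_layer, n_head, n_embd // n_head, block_size
    flops_per_token = 6 * n_params + 12 * L * H * Q * T
    flops_per_iter = flops_per_token * T * fwdbwd_per_iter
    flops_achieved = flops_per_iter / dt  # FLOPs per second actually reached
    return flops_achieved / peak_flops    # fraction of assumed peak (A100 bf16: 312 TFLOPS)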
config/eval_gpt2.py
ADDED
@@ -0,0 +1,8 @@
# evaluate the base gpt2
# n_layer=12, n_head=12, n_embd=768
# 124M parameters
batch_size = 8
eval_iters = 500 # use more iterations to get good estimate
eval_only = True
wandb_log = False
init_from = 'gpt2'
config/eval_gpt2_large.py
ADDED
@@ -0,0 +1,8 @@
# evaluate the base gpt2
# n_layer=36, n_head=20, n_embd=1280
# 774M parameters
batch_size = 8
eval_iters = 500 # use more iterations to get good estimate
eval_only = True
wandb_log = False
init_from = 'gpt2-large'
config/eval_gpt2_medium.py
ADDED
@@ -0,0 +1,8 @@
# evaluate the base gpt2
# n_layer=24, n_head=16, n_embd=1024
# 350M parameters
batch_size = 8
eval_iters = 500 # use more iterations to get good estimate
eval_only = True
wandb_log = False
init_from = 'gpt2-medium'
config/eval_gpt2_xl.py
ADDED
@@ -0,0 +1,8 @@
# evaluate the base gpt2
# n_layer=48, n_head=25, n_embd=1600
# 1558M parameters
batch_size = 8
eval_iters = 500 # use more iterations to get good estimate
eval_only = True
wandb_log = False
init_from = 'gpt2-xl'
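The parameter counts quoted in these four eval configs can be sanity-checked from the listed shapes; a rough back-of-the-envelope estimate (about 12·n_layer·n_embd² for the transformer blocks plus the token and position embeddings, assuming GPT-2's standard vocab of 50257 and context of 1024, which are not stated in these files):

# rough parameter-count check for the four GPT-2 sizes listed above;
# vocab_size=50257 and block_size=1024 are the standard GPT-2 values (assumed here)
def approx_params(n_layer, n_embd, vocab_size=50257, block_size=1024):
    blocks = 12 * n_layer * n_embd ** 2           # attention + MLP weights per layer
    embeddings = (vocab_size + block_size) * n_embd
    return blocks + embeddings

for name, (n_layer, n_embd) in {
    'gpt2': (12, 768), 'gpt2-medium': (24, 1024),
    'gpt2-large': (36, 1280), 'gpt2-xl': (48, 1600),
}.items():
    print(name, f"~{approx_params(n_layer, n_embd)/1e6:.0f}M")
# prints roughly 124M, 355M, 773M, 1557M -- in line with the comments above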
config/finetune_shakespeare.py
ADDED
@@ -0,0 +1,25 @@
import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2-xl' # this is the largest GPT-2 model

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 20

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
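A quick check of the tokens-per-iteration arithmetic in the comments above:

# sanity check of the finetune config's comment: tokens processed per optimizer step
tokens_per_iter = 1 * 32 * 1024       # batch_size * gradient_accumulation_steps * block_size
print(tokens_per_iter)                # 32768
print(301966 / tokens_per_iter)       # ~9.2 iterations per epoch of the ~302K-token dataset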
config/train_gpt2.py
ADDED
@@ -0,0 +1,25 @@
# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB
# launch as the following (e.g. in a screen session) and wait ~5 days:
# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py

wandb_log = True
wandb_project = 'owt'
wandb_run_name = 'gpt2-124M'

# these make the total batch size be ~0.5M
# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
batch_size = 12
block_size = 1024
gradient_accumulation_steps = 5 * 8

# this makes total number of tokens be 300B
max_iters = 600000
lr_decay_iters = 600000

# eval stuff
eval_interval = 1000
eval_iters = 200
log_interval = 10

# weight decay
weight_decay = 1e-1
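The batch-size and token-budget comments in this config can be verified the same way:

# tokens per iteration across the 8-GPU run described above, and the total token budget
tokens_per_iter = 12 * 1024 * 5 * 8    # batch_size * block_size * grad accum * GPUs
print(tokens_per_iter)                 # 491520, i.e. ~0.5M tokens per iteration
print(tokens_per_iter * 600000 / 1e9)  # ~294.9B tokens over max_iters, i.e. roughly 300B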
config/train_shakespeare_char.py
ADDED
@@ -0,0 +1,37 @@
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

warmup_iters = 100 # not super necessary potentially

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
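For scale, the "baby GPT" defined above is tiny; a rough size estimate using the same back-of-the-envelope as for the GPT-2 eval configs (the character vocabulary of ~65 symbols is an assumption here, since the shakespeare_char data preparation is not part of this commit):

# rough parameter count for the baby character-level model above; vocab_size=65 is assumed
n_layer, n_embd, block_size, vocab_size = 6, 384, 256, 65
params = 12 * n_layer * n_embd ** 2 + (vocab_size + block_size) * n_embd
print(f"~{params/1e6:.1f}M parameters")  # ~10.7M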
configurator.py
ADDED
@@ -0,0 +1,47 @@
"""
Poor Man's Configurator. Probably a terrible idea. Example usage:
$ python train.py config/override_file.py --batch_size=32
this will first run config/override_file.py, then override batch_size to 32

The code in this file will be run as follows from e.g. train.py:
>>> exec(open('configurator.py').read())

So it's not a Python module, it's just shuttling this code away from train.py
The code in this script then overrides the globals()

I know people are not going to love this, I just really dislike configuration
complexity and having to prepend config. to every single variable. If someone
comes up with a better simple Python solution I am all ears.
"""

import sys
from ast import literal_eval

for arg in sys.argv[1:]:
    if '=' not in arg:
        # assume it's the name of a config file
        assert not arg.startswith('--')
        config_file = arg
        print(f"Overriding config with {config_file}:")
        with open(config_file) as f:
            print(f.read())
        exec(open(config_file).read())
    else:
        # assume it's a --key=value argument
        assert arg.startswith('--')
        key, val = arg.split('=')
        key = key[2:]
        if key in globals():
            try:
                # attempt to eval it (e.g. if bool, number, etc.)
                attempt = literal_eval(val)
            except (SyntaxError, ValueError):
                # if that goes wrong, just use the string
                attempt = val
            # ensure the types match ok
            assert type(attempt) == type(globals()[key])
            # cross fingers
            print(f"Overriding: {key} = {attempt}")
            globals()[key] = attempt
        else:
            raise ValueError(f"Unknown config key: {key}")
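Since configurator.py mutates the caller's globals(), the calling script just declares its defaults as plain module-level variables before the exec. A minimal sketch of such a driver, following the docstring above (train.py is not part of this commit, so the variable names here are illustrative):

# minimal sketch of a driver script that consumes configurator.py as its docstring describes;
# the defaults below are illustrative, not taken from train.py (which is not in this commit)
batch_size = 12          # defaults are plain module-level globals...
learning_rate = 6e-4
device = 'cuda'
exec(open('configurator.py').read())  # ...which config files and --key=value args may overwrite
print(f"batch_size={batch_size}, learning_rate={learning_rate}, device={device}")

Invoked as, say, python driver.py config/train_gpt2.py --batch_size=32, the config file is exec'd first and then --batch_size=32 overrides that value, matching the order described in the docstring.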