db committed on
Commit
05a0e7d
1 Parent(s): 2e25161
bench.py ADDED
@@ -0,0 +1,117 @@
+ """
+ A much shorter version of train.py for benchmarking
+ """
+ import os
+ from contextlib import nullcontext
+ import numpy as np
+ import time
+ import torch
+ from model import GPTConfig, GPT
+
+ # -----------------------------------------------------------------------------
+ batch_size = 12
+ block_size = 1024
+ bias = False
+ real_data = True
+ seed = 1337
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
+ dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
+ compile = True # use PyTorch 2.0 to compile the model to be faster
+ profile = False # use pytorch profiler, or just simple benchmarking?
+ exec(open('configurator.py').read()) # overrides from command line or config file
+ # -----------------------------------------------------------------------------
+
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+ torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+ device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+ # data loading init
+ if real_data:
+     dataset = 'openwebtext'
+     data_dir = os.path.join('data', dataset)
+     train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+     def get_batch(split):
+         data = train_data # note ignore split in benchmarking script
+         ix = torch.randint(len(data) - block_size, (batch_size,))
+         x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+         y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+         return x, y
+ else:
+     # alternatively, if fixed data is desired to not care about data loading
+     x = torch.randint(50304, (batch_size, block_size), device=device)
+     y = torch.randint(50304, (batch_size, block_size), device=device)
+     get_batch = lambda split: (x, y)
+
+ # model init
+ gptconf = GPTConfig(
+     block_size = block_size, # how far back does the model look? i.e. context size
+     n_layer = 12, n_head = 12, n_embd = 768, # size of the model
+     dropout = 0, # for determinism
+     bias = bias,
+ )
+ model = GPT(gptconf)
+ model.to(device)
+
+ optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
+
+ if compile:
+     print("Compiling model...")
+     model = torch.compile(model) # pytorch 2.0
+
+ if profile:
+     # useful docs on pytorch profiler:
+     # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
+     # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
+     wait, warmup, active = 5, 5, 5
+     num_steps = wait + warmup + active
+     with torch.profiler.profile(
+         activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+         schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
+         on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
+         record_shapes=False,
+         profile_memory=False,
+         with_stack=False, # incurs an additional overhead, disable if not needed
+         with_flops=True,
+         with_modules=False, # only for torchscript models atm
+     ) as prof:
+
+         X, Y = get_batch('train')
+         for k in range(num_steps):
+             with ctx:
+                 logits, loss = model(X, Y)
+             X, Y = get_batch('train')
+             optimizer.zero_grad(set_to_none=True)
+             loss.backward()
+             optimizer.step()
+             lossf = loss.item()
+             print(f"{k}/{num_steps} loss: {lossf:.4f}")
+
+             prof.step() # notify the profiler at end of each step
+
+ else:
+
+     # simple benchmarking
+     torch.cuda.synchronize()
+     for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
+         t0 = time.time()
+         X, Y = get_batch('train')
+         for k in range(num_steps):
+             with ctx:
+                 logits, loss = model(X, Y)
+             X, Y = get_batch('train')
+             optimizer.zero_grad(set_to_none=True)
+             loss.backward()
+             optimizer.step()
+             lossf = loss.item()
+             print(f"{k}/{num_steps} loss: {lossf:.4f}")
+         torch.cuda.synchronize()
+         t1 = time.time()
+         dt = t1-t0
+         mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
+         if stage == 1:
+             print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
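Usage note (not part of the diff): because bench.py exec's configurator.py, any of the module-level defaults above can be overridden from the command line. A few hedged example invocations, assuming data/openwebtext/train.bin has already been prepared when real_data=True:

$ python bench.py                        # defaults: bfloat16 on CUDA, torch.compile on, simple timing plus MFU estimate
$ python bench.py --compile=False        # eager-mode baseline for comparison against the compiled model
$ python bench.py --profile=True         # write a PyTorch profiler trace to ./bench_log for TensorBoard
$ python bench.py --real_data=False      # fixed random batches, taking data loading out of the measurement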
config/eval_gpt2.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2
+ # n_layer=12, n_head=12, n_embd=768
+ # 124M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2'
config/eval_gpt2_large.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2-large
+ # n_layer=36, n_head=20, n_embd=1280
+ # 774M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-large'
config/eval_gpt2_medium.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2-medium
+ # n_layer=24, n_head=16, n_embd=1024
+ # 350M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-medium'
config/eval_gpt2_xl.py ADDED
@@ -0,0 +1,8 @@
+ # evaluate the base gpt2-xl
+ # n_layer=48, n_head=25, n_embd=1600
+ # 1558M parameters
+ batch_size = 8
+ eval_iters = 500 # use more iterations to get good estimate
+ eval_only = True
+ wandb_log = False
+ init_from = 'gpt2-xl'
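The four eval configs above differ only in the model size noted in their comments and in init_from. A hedged launch example, assuming the standard train.py entry point picks these up through configurator.py:

$ python train.py config/eval_gpt2.py       # eval-only pass with the pretrained 124M 'gpt2' weights
$ python train.py config/eval_gpt2_xl.py    # same, with the 1558M 'gpt2-xl' weights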
config/finetune_shakespeare.py ADDED
@@ -0,0 +1,25 @@
+ import time
+
+ out_dir = 'out-shakespeare'
+ eval_interval = 5
+ eval_iters = 40
+ wandb_log = False # feel free to turn on
+ wandb_project = 'shakespeare'
+ wandb_run_name = 'ft-' + str(time.time())
+
+ dataset = 'shakespeare'
+ init_from = 'gpt2-xl' # this is the largest GPT-2 model
+
+ # only save checkpoints if the validation loss improves
+ always_save_checkpoint = False
+
+ # the number of examples per iter:
+ # 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+ # shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+ batch_size = 1
+ gradient_accumulation_steps = 32
+ max_iters = 20
+
+ # finetune at constant LR
+ learning_rate = 3e-5
+ decay_lr = False
@@ -0,0 +1,25 @@
+ # config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB
+ # launch as the following (e.g. in a screen session) and wait ~5 days:
+ # $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
+
+ wandb_log = True
+ wandb_project = 'owt'
+ wandb_run_name = 'gpt2-124M'
+
+ # these make the total batch size be ~0.5M
+ # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
+ batch_size = 12
+ block_size = 1024
+ gradient_accumulation_steps = 5 * 8
+
+ # this makes total number of tokens be 300B
+ max_iters = 600000
+ lr_decay_iters = 600000
+
+ # eval stuff
+ eval_interval = 1000
+ eval_iters = 200
+ log_interval = 10
+
+ # weight decay
+ weight_decay = 1e-1
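Writing out the arithmetic behind the comments above (a sanity check, not part of the commit):

tokens_per_iter = 12 * 1024 * 5 * 8       # batch_size * block_size * grad accum per GPU * 8 GPUs = 491,520 (~0.5M)
total_tokens = tokens_per_iter * 600000   # = 294,912,000,000, i.e. the ~300B tokens mentioned above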
config/train_shakespeare_char.py ADDED
@@ -0,0 +1,37 @@
+ # train a miniature character-level shakespeare model
+ # good for debugging and playing on macbooks and such
+
+ out_dir = 'out-shakespeare-char'
+ eval_interval = 250 # keep frequent because we'll overfit
+ eval_iters = 200
+ log_interval = 10 # don't print too often
+
+ # we expect to overfit on this small dataset, so only save when val improves
+ always_save_checkpoint = False
+
+ wandb_log = False # override via command line if you like
+ wandb_project = 'shakespeare-char'
+ wandb_run_name = 'mini-gpt'
+
+ dataset = 'shakespeare_char'
+ gradient_accumulation_steps = 1
+ batch_size = 64
+ block_size = 256 # context of up to 256 previous characters
+
+ # baby GPT model :)
+ n_layer = 6
+ n_head = 6
+ n_embd = 384
+ dropout = 0.2
+
+ learning_rate = 1e-3 # with baby networks can afford to go a bit higher
+ max_iters = 5000
+ lr_decay_iters = 5000 # make equal to max_iters usually
+ min_lr = 1e-4 # learning_rate / 10 usually
+ beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
+
+ warmup_iters = 100 # not super necessary potentially
+
+ # on macbook also add
+ # device = 'cpu' # run on cpu only
+ # compile = False # do not torch compile the model
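A hedged way to launch this config (assuming the standard train.py entry point), including the CPU overrides suggested in the closing comments:

$ python train.py config/train_shakespeare_char.py
$ python train.py config/train_shakespeare_char.py --device=cpu --compile=False   # macbook / no-GPU run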
configurator.py ADDED
@@ -0,0 +1,47 @@
+ """
+ Poor Man's Configurator. Probably a terrible idea. Example usage:
+ $ python train.py config/override_file.py --batch_size=32
+ this will first run config/override_file.py, then override batch_size to 32
+
+ The code in this file will be run as follows from e.g. train.py:
+ >>> exec(open('configurator.py').read())
+
+ So it's not a Python module, it's just shuttling this code away from train.py
+ The code in this script then overrides the globals()
+
+ I know people are not going to love this, I just really dislike configuration
+ complexity and having to prepend config. to every single variable. If someone
+ comes up with a better simple Python solution I am all ears.
+ """
+
+ import sys
+ from ast import literal_eval
+
+ for arg in sys.argv[1:]:
+     if '=' not in arg:
+         # assume it's the name of a config file
+         assert not arg.startswith('--')
+         config_file = arg
+         print(f"Overriding config with {config_file}:")
+         with open(config_file) as f:
+             print(f.read())
+         exec(open(config_file).read())
+     else:
+         # assume it's a --key=value argument
+         assert arg.startswith('--')
+         key, val = arg.split('=')
+         key = key[2:]
+         if key in globals():
+             try:
+                 # attempt to eval it (e.g. if bool, number, etc.)
+                 attempt = literal_eval(val)
+             except (SyntaxError, ValueError):
+                 # if that goes wrong, just use the string
+                 attempt = val
+             # ensure the types match ok
+             assert type(attempt) == type(globals()[key])
+             # cross fingers
+             print(f"Overriding: {key} = {attempt}")
+             globals()[key] = attempt
+         else:
+             raise ValueError(f"Unknown config key: {key}")
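To make the exec-based override mechanism concrete, here is a minimal sketch with a hypothetical caller (demo.py is not part of this commit). The calling script defines plain module-level defaults, then exec's configurator.py in its own global namespace, which rewrites globals() in place:

# demo.py (hypothetical)
batch_size = 12    # plain global, acts as the default
block_size = 1024  # another default

exec(open('configurator.py').read())  # applies config files and --key=value overrides

print(batch_size, block_size)

# $ python demo.py --batch_size=32
# Overriding: batch_size = 32
# 32 1024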