
Pythia-2.8B Deduped 4K is a Pythia-2.8B Deduped model fine-tuned with a 4096-token context length. Training resumed from EleutherAI's 143,000-step checkpoint and continued on The Pile v1 Deduped (threshold=0.87). This particular model comes from a checkpoint captured at step 175,500, after an extra 134,217,728,000 tokens of training.
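
For reference, the checkpoint loads like any other Pythia/GPT-NeoX model through the transformers library. A minimal sketch, assuming the CarperAI/pythia-2.8b-deduped-4k repository id on the Hugging Face Hub:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "CarperAI/pythia-2.8b-deduped-4k"  # Hub repository id; use a local path if preferred

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# The model was fine-tuned with a 4096-token context, so prompts up to that length are supported.
inputs = tokenizer("The Pile is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))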

Note: Sequence length warmup was not used when moving up from 2048 to 4096 but, in hindsight, it should have been applied.
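
A sequence length warmup of the kind mentioned above typically ramps the training sequence length from the old context size to the new one over the first part of fine-tuning, rather than jumping straight to 4096. A minimal sketch of such a linear ramp (the function, warmup span, and rounding multiple are illustrative assumptions, not settings from this run):

def seq_len_at_step(step, start_len=2048, target_len=4096, warmup_steps=2000, multiple=64):
    """Illustrative linear sequence-length warmup: ramp from start_len to target_len
    over warmup_steps, rounding down to a hardware-friendly multiple."""
    if step >= warmup_steps:
        return target_len
    frac = step / warmup_steps
    length = start_len + frac * (target_len - start_len)
    return max(start_len, int(length // multiple) * multiple)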

Config

{
  # 8 Nodes 8xA100 40GB
  "eval_batch_size": 2,

  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  "num-layers": 32,
  "hidden-size": 2560,
  "num-attention-heads": 32,
  "seq-length": 4096,
  "max-position-embeddings": 4096,

  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary-pct": 0.25,
  "no-weight-tying": true,
  "gpt-j-residual": true,
  "output-layer-parallelism": "column",

  "init_method": "small_init",
  "output_layer_init_method": "wang_init",
  
  "attention-config": [[["flash"], 32]],
  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1.6e-5,
      "betas": [0.9, 0.95],
      "eps": 1.0e-08
    },
  },
  "min_lr": 8.0e-06,

  "zero_optimization":{
      "stage": 1,
      "allgather_partitions": true,
      "allgather_bucket_size": 500000000,
      "overlap_comm": true,
      "reduce_scatter": true,
      "reduce_bucket_size": 500000000,
      "contiguous_gradients": true,
      "cpu_offload": false,
  },
  "train_micro_batch_size_per_gpu": 4,
  "gradient-accumulation-steps": 4,
  "data-impl": "mmap",

  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  "fp16": {
      "fp16": true,
      "enabled": true,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "initial_scale_power": 12,
      "hysteresis": 2,
      "min_loss_scale": 1,
  },

  "train-iters": 318000, 
  "lr-decay-iters": 318000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 500,
  "eval-interval": 50000,
  "eval-iters": 10,
  "extra-save-iters": [0, 512, 152001],

  "train-data-paths": ["pile_0.87_deduped_text_document"],
  "valid-data-paths": ["pile_0.87_deduped_text_document"],
  "test-data-paths": ["pile_0.87_deduped_text_document"],

  "tokenizer_type": "HFTokenizer",
  "vocab-file": "20B_tokenizer.json",

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
  "log-grad-norm": true,

  "launcher": "slurm",
  "deepspeed_slurm": true,
}
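
From the batch and parallelism settings above, the effective global batch can be worked out: with 64 GPUs (8 nodes × 8xA100, per the comment at the top of the config) and pipeline- and model-parallel sizes of 1, the data-parallel size is 64. A small sketch of that arithmetic, using only values that appear in the config:

gpus = 8 * 8                    # 8 nodes x 8xA100 40GB
data_parallel = gpus // (1 * 1) # pipe-parallel-size * model-parallel-size

micro_batch = 4                 # train_micro_batch_size_per_gpu
grad_accum = 4                  # gradient-accumulation-steps
seq_len = 4096                  # seq-length

sequences_per_step = micro_batch * grad_accum * data_parallel  # 1024
tokens_per_step = sequences_per_step * seq_len                 # 4,194,304
print(sequences_per_step, tokens_per_step)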

Acknowledgements

This work would not have been possible without the support of Stability AI.
