Spaces:

hf-accelerate
/

accelerate_examples

Running on CPU Upgrade

App Files Files Community

muellerzr HF staff committed on Feb 1, 2023

Commit

06a60a3

•

1 Parent(s): b91e31d

Refactor

Browse files

Files changed (20) hide show

Makefile +19 -0
code_samples/accelerate +0 -17
code_samples/basic +0 -31
code_samples/calculating_metrics +0 -51
code_samples/checkpointing +0 -29
code_samples/experiment_tracking +0 -32
code_samples/gradient_accumulation +0 -33
code_samples/initial +0 -11
code_samples/initial_with_metrics +0 -27
code_samples/large_scale_training/aws_sagemaker +0 -77
code_samples/training_configuration/aws_sagemaker +51 -0
code_samples/{large_scale_training → training_configuration}/deepspeed +33 -43
code_samples/{large_scale_training → training_configuration}/megatron-lm +57 -68
code_samples/{large_scale_training → training_configuration}/multi_gpu +21 -37
code_samples/{large_scale_training → training_configuration}/multi_node_multi_gpu +36 -47
code_samples/{large_scale_training → training_configuration}/pytorch_fsdp +33 -40
setup.cfg +19 -0
src/app.py +90 -48
src/markup.py +1 -0
src/template.py +4 -1

Makefile ADDED Viewed

	@@ -0,0 +1,19 @@

+.PHONY: quality style test docs
+# Check that source code meets quality standards
+extra_quality_checks:
+	doc-builder style src --max_len 119
+# this target runs checks on all files
+quality:
+	black --check src
+	isort --check-only src
+	flake8 src
+	doc-builder style src --max_len 119 --check_only
+# Format source code automatically and check if there are any problems left that need manual fixing
+style:
+	black src
+	isort src
+	doc-builder style src --max_len 119

code_samples/accelerate DELETED Viewed

@@ -1,17 +0,0 @@
-<pre>
-from accelerate import Accelerator
-accelerator = Accelerator()
-train_dataloader, model, optimizer scheduler = accelerator.prepare(
-        dataloader, model, optimizer, scheduler
-)
-model.train()
-for batch in train_dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-    accelerator.backward(loss)
-    optimizer.step()
-    scheduler.step()
-</pre>

code_samples/basic DELETED Viewed

@@ -1,31 +0,0 @@
-##
-<pre>
-+from accelerate import Accelerator
-+accelerator = Accelerator()
-+dataloader, model, optimizer scheduler = accelerator.prepare(
-+        dataloader, model, optimizer, scheduler
-+)
-for batch in dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
--    inputs = inputs.to(device)
--    targets = targets.to(device)
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
--    loss.backward()
-+    accelerator.backward(loss)
-    optimizer.step()
-    scheduler.step()</pre>
-##
-Everything around `accelerate` occurs with the `Accelerator` class. To use it, first make an object.
-Then call `.prepare` passing in the PyTorch objects that you would normally train with. This will
-return the same objects, but they will be on the correct device and distributed if needed. Then
-you can train as normal, but instead of calling `loss.backward()` you call `accelerator.backward(loss)`.
-Also note that you don't need to call `model.to(device)` or `inputs.to(device)` anymore, as this
-is done automatically by `accelerator.prepare()`.
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/basic_tutorials/migration" target="_blank">Migrating to 🤗 Accelerate</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator" target="_blank">The Accelerator</a>

code_samples/calculating_metrics DELETED Viewed

@@ -1,51 +0,0 @@
-##
-<pre>
-import evaluate
-+from accelerate import Accelerator
-+accelerator = Accelerator()
-+train_dataloader, eval_dataloader, model, optimizer, scheduler = (
-+    accelerator.prepare(
-+        train_dataloader, eval_dataloader,
-+        model, optimizer, scheduler
-+    )
-+)
-metric = evaluate.load("accuracy")
-for batch in train_dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
--    inputs = inputs.to(device)
--    targets = targets.to(device)
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-    loss.backward()
-    optimizer.step()
-    scheduler.step()
-model.eval()
-for batch in eval_dataloader:
-    inputs, targets = batch
--    inputs = inputs.to(device)
--    targets = targets.to(device)
-    with torch.no_grad():
-        outputs = model(inputs)
-    predictions = outputs.argmax(dim=-1)
-+    predictions, references = accelerator.gather_for_metrics(
-+        (predictions, references)
-+    )
-    metric.add_batch(
-        predictions = predictions,
-        references = references
-    )
-print(metric.compute())</pre>
-##
-When calculating metrics on a validation set, you can use the `Accelerator.gather_for_metrics`
-method to gather the predictions and references from all devices and then calculate the metric on the gathered values.
-This will also *automatically* drop the padded values from the gathered tensors that were added to ensure
-that all tensors have the same length. This ensures that the metric is calculated on the correct values.
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/en/quicktour#distributed-evaluation" target="_blank">Quicktour - Calculating metrics</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.gather_for_metrics" target="_blank">API reference</a>
-- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py" target="_blank">Example script</a>

code_samples/checkpointing DELETED Viewed

@@ -1,29 +0,0 @@
-##
-<pre>
-from accelerate import Accelerator
-accelerator = Accelerator()
-dataloader, model, optimizer scheduler = accelerator.prepare(
-        dataloader, model, optimizer, scheduler
-)
-for batch in dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-    accelerator.backward(loss)
-    optimizer.step()
-    scheduler.step()
-+accelerator.save_state("checkpoint_dir")
-+accelerator.load_state("checkpoint_dir")</pre>
-##
-To save or load a checkpoint in, `Accelerator` provides the `save_state` and `load_state` methods.
-These methods will save or load the state of the model, optimizer, scheduler, as well as random states and
-any custom registered objects from the main process on each device to a passed in folder.
-**This API is designed to save and resume training states only from within the same python script or training setup.**
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/usage_guides/checkpoint" target="_blank">Saving and loading training states</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" target="_blank">`save_state` API reference</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.load_state" target="_blank">`load_state` API reference</a>
-- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py" target="_blank">Example script</a>

code_samples/experiment_tracking DELETED Viewed

@@ -1,32 +0,0 @@
-##
-<pre>
-from accelerate import Accelerator
--accelerator = Accelerator()
-+accelerator = Accelerator(log_with="wandb")
-train_dataloader, model, optimizer scheduler = accelerator.prepare(
-        dataloader, model, optimizer, scheduler
-)
-+accelerator.init_trackers()
-model.train()
-for batch in train_dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-+    accelerator.log({"loss":loss})
-    accelerator.backward(loss)
-    optimizer.step()
-    scheduler.step()
-+accelerator.end_training()
-</pre>
-##
-To use experiment trackers with `accelerate`, simply pass the desired tracker to the `log_with` parameter
-when building the `Accelerator` object. Then initialize the tracker(s) by running `Accelerator.init_trackers()`
-passing in any configurations they may need. Afterwards call `Accelerator.log` to log a particular value to your tracker.
-At the end of training call `accelerator.end_training()` to call any finalization functions a tracking library
-may need automatically.
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/usage_guides/tracking" target="_blank">Using experiment trackers</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.log" target="_blank">Accelerator API Reference</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/tracking" target="_blank">Tracking API Reference</a>

code_samples/gradient_accumulation DELETED Viewed

@@ -1,33 +0,0 @@
-##
-<pre>
-from accelerate import Accelerator
-accelerator = Accelerator(
-+    gradient_accumulation_steps=2,
-)
-dataloader, model, optimizer scheduler = accelerator.prepare(
-        dataloader, model, optimizer, scheduler
-)
-for batch in dataloader:
-+  with accelerator.accumulate(model):
-      optimizer.zero_grad()
-      inputs, targets = batch
-      outputs = model(inputs)
-      loss = loss_function(outputs, targets)
-      accelerator.backward(loss)
-      optimizer.step()
-      scheduler.step()</pre>
-##
-When performing gradient accumulation in a distributed setup, there are many opportunities for efficiency mistakes
-to occur. `Accelerator` provides a context manager that will take care of the details for you and ensure that the
-model is training correctly. Simply wrap the training loop in the `Accelerator.accumulate` context manager
-while passing in the model you are training on and during training the gradients will accumulate and synchronize
-automatically when needed.
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation" target="_blank">Performing gradient accumulation</a>
-- <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.accumulate" target="_blank">API reference</a>
-- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py" target="_blank">Example script</a>
-- <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py" target="_blank">Performing automatic gradient accumulation example script</a>

code_samples/initial DELETED Viewed

@@ -1,11 +0,0 @@
-<pre>
-for batch in dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
-    inputs = inputs.to(device)
-    targets = targets.to(device)
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-    loss.backward()
-    optimizer.step()
-    scheduler.step()</pre>

code_samples/initial_with_metrics DELETED Viewed

@@ -1,27 +0,0 @@
-<pre>
-import evaluate
-metric = evaluate.load("accuracy")
-for batch in train_dataloader:
-    optimizer.zero_grad()
-    inputs, targets = batch
-    inputs = inputs.to(device)
-    targets = targets.to(device)
-    outputs = model(inputs)
-    loss = loss_function(outputs, targets)
-    loss.backward()
-    optimizer.step()
-    scheduler.step()
-model.eval()
-for batch in eval_dataloader:
-    inputs, targets = batch
-    inputs = inputs.to(device)
-    targets = targets.to(device)
-    with torch.no_grad():
-        outputs = model(inputs)
-    predictions = outputs.argmax(dim=-1)
-    metric.add_batch(
-        predictions = predictions,
-        references = references
-    )
-print(metric.compute())</pre>

code_samples/large_scale_training/aws_sagemaker DELETED Viewed

@@ -1,77 +0,0 @@
-##
-Run  `accelerate config` on and answer the questionnaire accordingly.
-Below is an example yaml for running code remotely on AWS SageMaker. Replace placeholder `xxxxx` with
-appropriate values.
-<pre>
-base_job_name: accelerate-sagemaker-1
-compute_environment: AMAZON_SAGEMAKER
-distributed_type: 'NO'
-dynamo_backend: 'NO'
-ec2_instance_type: ml.p3.2xlarge
-gpu_ids: all
-iam_role_name: xxxxx
-mixed_precision: 'no'
-num_machines: 1
-profile: xxxxx
-py_version: py38
-pytorch_version: 1.10.2
-region: us-east-1
-transformers_version: 4.17.0
-use_cpu: false
-</pre>
-##
-<pre>
-from accelerate import Accelerator
-def parse_args():
-    parser = argparse.ArgumentParser(description="sample task")
-    parser.add_argument(
-        "--pad_to_max_length",
--        action="store_true",
-+        type=bool,
-+        default=False,
-        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
-    )
-    ...
-+ def main():
-      accelerator = Accelerator()
-      model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-          model, optimizer, training_dataloader, scheduler
-      )
-      for batch in training_dataloader:
-          optimizer.zero_grad()
-          inputs, targets = batch
-          outputs = model(inputs)
-          loss = loss_function(outputs, targets)
-          accelerator.backward(loss)
-          optimizer.step()
-          scheduler.step()
--    torch.save('/opt/ml/model`)
-+    accelerator.save('/opt/ml/model')
-+ if __name__ == "__main__":
-+     main()
-</pre>
-Launching a script using default accelerate config file looks like the following:
-```
-accelerate launch {script_name.py} {--arg1} {--arg2} ...
-```
-##
-SageMaker doesn’t support argparse actions. If you want to use, for example, boolean hyperparameters, you need to specify type as bool in your script and provide an explicit True or False value for this hyperparameter. An example for the same is shown above for `pad_to_max_length` argument. Another important point is to save all the output artifacts to `/opt/ml/model` or use `os.environ["SM_MODEL_DIR"]` as your save directory. After training, artifacts in this directory are uploaded to S3, an example is shown in above code snippet.
-You can provide custom docker image, input channels pointing to S3 data locations and use SageMaker metrics logging
-as part of advanced features. Please refer <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
-##
-To learn more checkout the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/usage_guides/sagemaker" target="_blank">How to use 🤗 Accelerate with SageMaker</a>
-- <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
-- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>

code_samples/training_configuration/aws_sagemaker ADDED Viewed

	@@ -0,0 +1,51 @@

+##
+<pre>
++base_job_name: accelerate-sagemaker-1
++compute_environment: AMAZON_SAGEMAKER
+distributed_type: 'NO'
+dynamo_backend: 'NO'
++ec2_instance_type: ml.p3.2xlarge
++gpu_ids: all
++iam_role_name: MY_IAM_ROLE_NAME
+mixed_precision: 'no'
++num_machines: 1
++profile: MY_PROFILE_NAME
++py_version: py38
++pytorch_version: 1.10.2
++region: us-east-1
++transformers_version: 4.17.0
+use_cpu: false
+</pre>
+##
+<pre>
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="sample task"
+    )
+    parser.add_argument(
+        "--some_bool_arg",
+-        action="store_true",
++        type=bool,
++        default=False,
+    )
+</pre>
+##
+If the YAML was generated through the `accelerate config` command:
+```
+accelerate launch {script_name.py} {--arg1} {--arg2} ...
+```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+##
+SageMaker does not support argparse actions. As a result, if a script parameter would normally be a boolean, you need to specify the type as `bool` in the script and provide an explicit `True` or `False` value.
+Also, when using SageMaker all output artifacts should use `/opt/ml/model` or `os.environ["SM_MODEL_DIR"]` as your save directory. After training, artifacts in this directory are uploaded to S3.
+##
+To learn more, check out the related documentation:
+- <a href="https://huggingface.co/docs/accelerate/usage_guides/sagemaker" target="_blank">How to use 🤗 Accelerate with SageMaker</a>
+- <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
+- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>

code_samples/{large_scale_training → training_configuration}/deepspeed RENAMED Viewed

@@ -1,17 +1,16 @@
 ##
-Run  `accelerate config` and answer the questionnaire accordingly.
-Below is an example yaml for mixed-precision training using DeepSpeed ZeRO Stage-3 with CPU offloading on 8 GPUs.
 <pre>
 compute_environment: LOCAL_MACHINE
-deepspeed_config:
-  gradient_accumulation_steps: 1
-  gradient_clipping: 1.0
-  offload_optimizer_device: cpu
-  offload_param_device: cpu
-  zero3_init_flag: true
-  zero3_save_16bit_model: true
-  zero_stage: 3
-distributed_type: DEEPSPEED
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
@@ -19,61 +18,52 @@ machine_rank: 0
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: fp16
-num_machines: 1
-num_processes: 8
 rdzv_backend: static
 same_network: true
 use_cpu: false
 </pre>
 ##
 <pre>
-  from accelerate import Accelerator
-+ def main():
     accelerator = Accelerator()
     model, optimizer, training_dataloader, scheduler = accelerator.prepare(
         model, optimizer, training_dataloader, scheduler
     )
-    for batch in training_dataloader:
-        optimizer.zero_grad()
-        inputs, targets = batch
-        outputs = model(inputs)
-        loss = loss_function(outputs, targets)
-        accelerator.backward(loss)
-        optimizer.step()
-        scheduler.step()
-    ...
     generated_tokens = accelerator.unwrap_model(model).generate(
-                    batch["input_ids"],
-                    attention_mask=batch["attention_mask"],
-                    **gen_kwargs,
-+                    synced_gpus=True #required for ZeRO Stage 3
-                )
     ...
     accelerator.unwrap_model(model).save_pretrained(
             args.output_dir,
             is_main_process=accelerator.is_main_process,
             save_function=accelerator.save,
-+            state_dict=accelerator.get_state_dict(model), #required for ZeRO Stage 3
-        )
     ...
-+ if __name__ == "__main__":
-+     main()
 </pre>
-Launching a script using default accelerate config file looks like the following:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
-Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
 ```
 accelerate launch \
     --use_deepspeed \
@@ -90,12 +80,12 @@ accelerate launch \
 ```
 ##
-For core DeepSpeed features supported via accelerate config file, no changes are required for ZeRO Stages 1 and 2. For ZeRO Stage-3, transformers' `generate` function requires `synced_gpus=True` and `save_pretrained` requires the `state_dict` param due to the fact that model parameters are sharded across the GPUs.
-For advanced users who like granular control via DeepSpeed config file, it is supported wherein you can pass its loaction when running `accelerate config` command. You can also specify values of most of the fields in DeepSpeed config file as `auto` and they are filled automatically via the arguments of `accelerate launch` command and `accelerator.prepare` call thereby making life simple for users. Please refer docs on <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed#deepspeed-config-file" target="_blank">DeepSpeed Config File</a>
 ##
 To learn more checkout the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed" target="_blank">How to use DeepSpeed</a>
 - <a href="https://huggingface.co/blog/accelerate-deepspeed" target="_blank">Accelerate Large Model Training using DeepSpeed</a>
 - <a href="https://huggingface.co/docs/accelerate/package_reference/deepspeed" target="_blank">DeepSpeed Utilities</a>

 ##
+Below is an example yaml for mixed precision training using DeepSpeed ZeRO Stage-3 with CPU offloading on 8 GPUs.
 <pre>
 compute_environment: LOCAL_MACHINE
++deepspeed_config:
++  gradient_accumulation_steps: 1
++  gradient_clipping: 1.0
++  offload_optimizer_device: cpu
++  offload_param_device: cpu
++  zero3_init_flag: true
++  zero3_save_16bit_model: true
++  zero_stage: 3
++distributed_type: DEEPSPEED
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: fp16
++num_machines: 1
++num_processes: 8
 rdzv_backend: static
 same_network: true
 use_cpu: false
 </pre>
 ##
+Assume that `model` is created utilizing the `transformers` library.
 <pre>
+from accelerate import Accelerator
+def main():
     accelerator = Accelerator()
     model, optimizer, training_dataloader, scheduler = accelerator.prepare(
         model, optimizer, training_dataloader, scheduler
     )
     generated_tokens = accelerator.unwrap_model(model).generate(
+        batch["input_ids"],
+        attention_mask=batch["attention_mask"],
+        **gen_kwargs,
++        synced_gpus=True
+    )
     ...
     accelerator.unwrap_model(model).save_pretrained(
             args.output_dir,
             is_main_process=accelerator.is_main_process,
             save_function=accelerator.save,
++            state_dict=accelerator.get_state_dict(model)
+    )
     ...
 </pre>
+##
+If the YAML was generated through the `accelerate config` command:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+Or you can use `accelerate launch` with the right configuration parameters and have no `config.yaml` file:
 ```
 accelerate launch \
     --use_deepspeed \
 ```
 ##
+For core DeepSpeed features (ZeRO stages 1 and 2), Accelerate requires no code changes. For ZeRO Stage-3, `transformers`' `generate` function requires `synced_gpus=True` and `save_pretrained` requires the `state_dict` param due to the fact that model parameters are sharded across the GPUs.
+You can also set the values of most of the fields in the `DeepSpeed` config file to `auto`, and they will be filled in automatically when running `accelerate launch`.
 ##
 To learn more, check out the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed" target="_blank">How to use DeepSpeed</a>
+- <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed#deepspeed-config-file" target="_blank">DeepSpeed Config File</a>
 - <a href="https://huggingface.co/blog/accelerate-deepspeed" target="_blank">Accelerate Large Model Training using DeepSpeed</a>
 - <a href="https://huggingface.co/docs/accelerate/package_reference/deepspeed" target="_blank">DeepSpeed Utilities</a>

code_samples/{large_scale_training → training_configuration}/megatron-lm RENAMED Viewed

@@ -1,23 +1,22 @@
 ##
-Run  `accelerate config` and answer the questionnaire accordingly.
-Below is an example yaml for BF16 mixed-precision training using Megatron-LM with DPxTPxPP=2x2x2 degrees on 8 GPUs. (DP-Data Parallelism, PP-Pipeline Parallelism, TP-Tensor Parallelism). It is also using Sequence Parallelism and selective activation checkpointing along with sharded optimizer.
 <pre>
 compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
-distributed_type: MEGATRON_LM
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 machine_rank: 0
 main_training_function: main
-megatron_lm_config:
-  megatron_lm_gradient_clipping: 1.0
-  megatron_lm_num_micro_batches: 2
-  megatron_lm_pp_degree: 2
-  megatron_lm_recompute_activations: true
-  megatron_lm_sequence_parallelism: true
-  megatron_lm_tp_degree: 2
-  megatron_lm_use_distributed_optimizer: true
 mixed_precision: bf16
 num_machines: 1
 num_processes: 8
@@ -27,67 +26,52 @@ use_cpu: false
 </pre>
 ##
 <pre>
-  from accelerate import Accelerator
-+ def main():
-    accelerator = Accelerator()
-    ...
--    lr_scheduler = get_scheduler(
--        name=args.lr_scheduler_type,
-+    lr_scheduler = accelerate.utils.MegatronLMDummyScheduler(
-        optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
-        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
-    )
-    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
-        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
-    )
-    total_batch_size = (
--            args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-+            accelerator.state.megatron_lm_plugin.global_batch_size
-        )
-    for batch in training_dataloader:
-        optimizer.zero_grad()
-        inputs, targets = batch
-        outputs = model(inputs)
-        loss = loss_function(outputs, targets)
-        accelerator.backward(loss)
-        optimizer.step()
-        scheduler.step()
-    ...
-    # in eval loop
-    for step, batch in enumerate(eval_dataloader):
-        with torch.no_grad():
-            outputs = model(**batch)
-        loss = outputs.loss
--        losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
-+        losses.append(loss) # For Megatron-LM, the losses are already averaged across the data parallel group
--    losses = torch.cat(losses)
-+    losses = torch.tensor(losses)
-    eval_loss = torch.mean(losses)
-    perplexity = math.exp(eval_loss)
-    logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
-+    accelerator.save_state(output_dir)
-+ if __name__ == "__main__":
-+     main()
 </pre>
-Launching a script using default accelerate config file looks like the following:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
-Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
 ```
 accelerate launch \
     --use_megatron_lm \
@@ -109,9 +93,14 @@ For Megatron-LM, the supported models Transformers GPT2, Megatron-BERT and T5 mo
     3. Losses are already averaged across the data parallel group
     4. save the model using `accelerator.save_state` instead of transformers `from_pretrianed`
-These changes have been highlited in the code snippet above.
-Megatron-LM intergration supports many advanced features such as ability to leverage custom train step, using Megatron-LM indexed datasets, checkpoint reshaping and interoperabiloity utilities, `megatron_generate` function for text generation using Tensor and Pipeline Parallelism and support for ROPE/ALibi Positional embeddings and Multi-Query Attention. However, these require more changes owing to the complexity; worth it for getting the highest performance.
 ##
 To learn more checkout the related documentation:

 ##
+Below is an example yaml for BF16 mixed-precision training using Megatron-LM with 2x Data Parallelism, 2x Pipeline Parallelism, and 2x Tensor Parallelism on 8 GPUs. It is also using Sequence Parallelism, selective activation checkpointing, and a sharded optimizer.
 <pre>
 compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
++distributed_type: MEGATRON_LM
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 machine_rank: 0
 main_training_function: main
++megatron_lm_config:
++  megatron_lm_gradient_clipping: 1.0
++  megatron_lm_num_micro_batches: 2
++  megatron_lm_pp_degree: 2
++  megatron_lm_recompute_activations: true
++  megatron_lm_sequence_parallelism: true
++  megatron_lm_tp_degree: 2
++  megatron_lm_use_distributed_optimizer: true
 mixed_precision: bf16
 num_machines: 1
 num_processes: 8
 </pre>
 ##
 <pre>
+from accelerate import Accelerator
++from accelerate.utils import MegatronLMDummyScheduler
+accelerator = Accelerator()
+...
+-lr_scheduler = get_scheduler(
+-    name=args.lr_scheduler_type,
+-    ...
+-)
++lr_scheduler = MegatronLMDummyScheduler(
++    optimizer=optimizer,
++    num_warmup_steps=...,
++    num_training_steps=...,
++)
+model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+)
+total_batch_size = (
+-    args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
++    accelerator.state.megatron_lm_plugin.global_batch_size
+)
+# in evaluation loop
+for step, batch in enumerate(eval_dataloader):
+    with torch.no_grad():
+        outputs = model(**batch)
+    loss = outputs.loss
+-    losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
++    losses.append(loss) # For Megatron-LM, the losses are already averaged across the data parallel group
+-losses = torch.cat(losses)
++losses = torch.tensor(losses)
 </pre>
+##
+If the YAML was generated through the `accelerate config` command:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+Or you can use `accelerate launch` with the right configuration parameters and have no `config.yaml` file:
 ```
 accelerate launch \
     --use_megatron_lm \
     3. Losses are already averaged across the data parallel group
     4. save the model using `accelerator.save_state` instead of transformers `save_pretrained`
+The Accelerate Megatron-LM integration supports many advanced features such as:
+- Leveraging custom training steps
+- Using Megatron-LM indexed datasets
+- Checkpoint reshaping and interoperability utilities
+- Using `megatron_generate` for text generation using Tensor and Pipeline Parallelism
+- Support for ROPE/ALibi Positional embeddings and Multi-Query Attention
+However, each of these requires more changes to your source code than what is presented here.
 ##
 To learn more, check out the related documentation:

code_samples/{large_scale_training → training_configuration}/multi_gpu RENAMED Viewed

@@ -1,60 +1,44 @@
 ##
-Run  `accelerate config` and answer the questionnaire accordingly.
-Below is an example yaml for using multi-gpu training with 4 GPUs.
 <pre>
-compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
-distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
-gpu_ids: all
-machine_rank: 0
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
-num_machines: 1
-num_processes: 4
-rdzv_backend: static
-same_network: true
 use_cpu: false</pre>
 ##
-<pre>
-  from accelerate import Accelerator
-+ def main():
-    accelerator = Accelerator()
-    model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-        model, optimizer, training_dataloader, scheduler
-    )
-    for batch in training_dataloader:
-        optimizer.zero_grad()
-        inputs, targets = batch
-        outputs = model(inputs)
-        loss = loss_function(outputs, targets)
-        accelerator.backward(loss)
-        optimizer.step()
-        scheduler.step()
-+ if __name__ == "__main__":
-+     main()
-</pre>
-Launching a script using default accelerate config file looks like the following:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
-Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
 ```
 accelerate launch --multi_gpu --num_processes=4 {script_name.py} {--arg1} {--arg2} ...
 ```
 ##
-Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficieny`.
 ##
 To learn more, check out the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
-- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>

 ##
 <pre>
+compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
++distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
++gpu_ids: all
++machine_rank: 0
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
++num_machines: 1
++num_processes: 4
++rdzv_backend: static
++same_network: true
 use_cpu: false</pre>
 ##
+None
+##
+If the YAML was generated through the `accelerate config` command:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
 ```
 accelerate launch --multi_gpu --num_processes=4 {script_name.py} {--arg1} {--arg2} ...
 ```
 ##
+Launching on multi-GPU instances requires a different launch command than just `python myscript.py`. `accelerate launch` wraps the proper launcher and delegates to it, configuring it from the parameters passed in. It is a passthrough to the `torchrun` command.
+**Remember that you can always use the `accelerate launch` functionality, even if the code in your script does not use the `Accelerator`**
 ##
 To learn more, check out the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
+- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>

code_samples/{large_scale_training → training_configuration}/multi_node_multi_gpu RENAMED Viewed

@@ -1,89 +1,78 @@
 ##
-Run  `accelerate config` on and answer the questionnaire accordingly.
-Below is an example yaml for using multi-gpu training with 4 GPUs on 2 nodes/machines.
-On Node/Machine 1:
 <pre>
-compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
-distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 gpu_ids: all
-machine_rank: 0
-main_process_ip: 192.168.20.1
-main_process_port: 8080
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
-num_machines: 2
-num_processes: 8
-rdzv_backend: static
-same_network: true
 use_cpu: false
 </pre>
-On Node/Machine 2:
 <pre>
-compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
-distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 gpu_ids: all
 -machine_rank: 0
 +machine_rank: 1
-main_process_ip: 192.168.20.1
-main_process_port: 8080
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
-num_machines: 2
-num_processes: 8
-rdzv_backend: static
-same_network: true
 use_cpu: false
 </pre>
 ##
-<pre>
-  from accelerate import Accelerator
-+ def main():
-      accelerator = Accelerator()
-      model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-          model, optimizer, training_dataloader, scheduler
-      )
-      for batch in training_dataloader:
-          optimizer.zero_grad()
-          inputs, targets = batch
-          outputs = model(inputs)
-          loss = loss_function(outputs, targets)
-          accelerator.backward(loss)
-          optimizer.step()
-          scheduler.step()
-+ if __name__ == "__main__":
-+     main()
-</pre>
-Launching a script using default accelerate config file looks like the following:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
-Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below. Replace `{node_number}` with appropriate number.
 ```
 accelerate launch --multi_gpu --num_machines=2 --num_processes=8 --main_process_ip="192.168.20.1" --main_process_port=8080
  --machine_rank={node_number} {script_name.py} {--arg1} {--arg2} ...
 ```
 ##
-Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficieny`.
 ##
 To learn more, check out the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
-- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>

 ##
+Below are example YAMLs for multi-GPU training with 8 GPUs in total across two machines (nodes), where each machine has four GPUs:
+On machine 1 (host):
 <pre>
+compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
++distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 gpu_ids: all
++machine_rank: 0
++main_process_ip: 192.168.20.1
++main_process_port: 8080
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
++num_machines: 2
++num_processes: 8
++rdzv_backend: static
++same_network: true
 use_cpu: false
 </pre>
+On machine 2:
 <pre>
+compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
++distributed_type: MULTI_GPU
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
 fsdp_config: {}
 gpu_ids: all
 -machine_rank: 0
 +machine_rank: 1
++main_process_ip: 192.168.20.1
++main_process_port: 8080
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: 'no'
++num_machines: 2
++num_processes: 8
++rdzv_backend: static
++same_network: true
 use_cpu: false
 </pre>
 ##
+None
+##
+To launch a script, on each machine run one of the following variations:
+If the YAML was generated through the `accelerate config` command:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
+Replace `{node_number}` with the appropriate machine rank (0 for the host, 1 or higher for every other machine).
 ```
 accelerate launch --multi_gpu --num_machines=2 --num_processes=8 --main_process_ip="192.168.20.1" --main_process_port=8080
  --machine_rank={node_number} {script_name.py} {--arg1} {--arg2} ...
 ```
 ##
+When utilizing multiple machines (nodes) for training, the config file needs to know how each machine will be able to communicate (the IP address and port), how many *total* GPUs there are, and whether the current machine is the host or a client.
+**Remember that you can always use the `accelerate launch` functionality, even if the code in your script does not use the `Accelerator`**
 ##
 To learn more, check out the related documentation:
 - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
+- <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>

code_samples/{large_scale_training → training_configuration}/pytorch_fsdp RENAMED Viewed

@@ -1,63 +1,55 @@
 ##
-Run  `accelerate config` and answer the questionnaire accordingly.
-Below is an example yaml for BF16 mixed-precision training using PyTorch FSDP with CPU offloading on 8 GPUs.
 <pre>
-compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
-distributed_type: FSDP
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
-fsdp_config:
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_backward_prefetch_policy: BACKWARD_PRE
-  fsdp_offload_params: true
-  fsdp_sharding_strategy: 1
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: T5Block
 machine_rank: 0
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: bf16
 num_machines: 1
-num_processes: 8
 rdzv_backend: static
 same_network: true
 use_cpu: false
 </pre>
 ##
 <pre>
-  from accelerate import Accelerator
-+ def main():
-    accelerator = Accelerator()
-    model = accelerator.prepare(model)
-    optimizer, training_dataloader, scheduler = accelerator.prepare(
-        optimizer, training_dataloader, scheduler
-    )
-    for batch in training_dataloader:
-        optimizer.zero_grad()
-        inputs, targets = batch
-        outputs = model(inputs)
-        loss = loss_function(outputs, targets)
-        accelerator.backward(loss)
-        optimizer.step()
-        scheduler.step()
-    ...
-+ if __name__ == "__main__":
-+     main()
 </pre>
-Launching a script using default accelerate config file looks like the following:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
-Alternatively, you can use `accelerate launch` with right config params for multi-gpu training as shown below
 ```
 accelerate launch \
     --use_fsdp \
@@ -71,10 +63,11 @@ accelerate launch \
 ```
 ##
-For PyTorch FDSP, you need to prepare the model first before preparing the optimizer since FSDP will shard parameters in-place and this will break any previously initialized optimizers. Same in outlined in the above code snippet. For transformer models, please use `TRANSFORMER_BASED_WRAP` auto wrap policy as shown in the config above.
 ##
 To learn more, check out the related documentation:
-- <a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp" target="_blank">How to use FSDP</a>
 - <a href="https://huggingface.co/blog/pytorch-fsdp" target="_blank">Accelerate Large Model Training using PyTorch Fully Sharded Data Parallel</a>

 ##
+Below is an example yaml for BF16 mixed-precision training using PyTorch Fully Sharded Data Parallelism (FSDP) with CPU offloading on 8 GPUs.
 <pre>
+compute_environment: LOCAL_MACHINE
 deepspeed_config: {}
++distributed_type: FSDP
 downcast_bf16: 'no'
 dynamo_backend: 'NO'
++fsdp_config:
++  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
++  fsdp_backward_prefetch_policy: BACKWARD_PRE
++  fsdp_offload_params: true
++  fsdp_sharding_strategy: 1
++  fsdp_state_dict_type: FULL_STATE_DICT
++  fsdp_transformer_layer_cls_to_wrap: T5Block
 machine_rank: 0
 main_training_function: main
 megatron_lm_config: {}
 mixed_precision: bf16
 num_machines: 1
++num_processes: 8
 rdzv_backend: static
 same_network: true
 use_cpu: false
 </pre>
 ##
 <pre>
+from accelerate import Accelerator
+accelerator = Accelerator()
+- model, optimizer, dataloader, scheduler = accelerator.prepare(
+-  model, optimizer, dataloader, scheduler
+-)
++model = accelerator.prepare(model)
++# Optimizer can be any PyTorch optimizer class
++optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
++optimizer, dataloader, scheduler = accelerator.prepare(
++  optimizer, dataloader, scheduler
++)
 </pre>
+##
+If the YAML was generated through the `accelerate config` command:
 ```
 accelerate launch {script_name.py} {--arg1} {--arg2} ...
 ```
+If the YAML is saved to a `~/config.yaml` file:
+```
+accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+```
+Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
 ```
 accelerate launch \
     --use_fsdp \
 ```
 ##
+For PyTorch FSDP, you need to prepare the model **before** preparing the optimizer, since FSDP will shard parameters in-place and this will break any previously initialized optimizers.
+For transformer models, please use `TRANSFORMER_BASED_WRAP` auto wrap policy as shown in the config above.
 ##
 To learn more, check out the related documentation:
+- <a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp" target="_blank">How to use Fully Sharded Data Parallelism</a>
 - <a href="https://huggingface.co/blog/pytorch-fsdp" target="_blank">Accelerate Large Model Training using PyTorch Fully Sharded Data Parallel</a>

setup.cfg ADDED Viewed

	@@ -0,0 +1,19 @@

+[isort]
+default_section = FIRSTPARTY
+ensure_newline_before_comments = True
+force_grid_wrap = 0
+include_trailing_comma = True
+known_first_party = accelerate
+known_third_party =
+    numpy
+    torch
+    torch_xla
+line_length = 119
+lines_after_imports = 2
+multi_line_output = 3
+use_parentheses = True
+[flake8]
+ignore = E203, E722, E501, E741, W503, W605
+max-line-length = 119

src/app.py CHANGED Viewed

@@ -1,10 +1,35 @@
 import gradio as gr
-from markup import highlight, get_text
 from template import get_templates
 templates = get_templates()
 def change(inp, textbox):
     """Based on an `inp`, render and highlight the appropriate code sample.
@@ -20,65 +45,80 @@ def change(inp, textbox):
     if textbox == "base":
         code, explanation, docs = get_text(inp, textbox)
         if inp == "Basic":
-            return (highlight(code), "## Accelerate Code (Base Integration)", explanation, docs)
         elif inp == "Calculating Metrics":
             return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
         else:
             return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
-    elif textbox == "large_scale_training":
-        config, code, explanation, docs = get_text(inp, textbox)
-        return (highlight(config), highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
-default = change("Basic", "base")
 def base_features(textbox):
-    # textbox.value = "base"
     inp = gr.Radio(
-        ["Basic", "Calculating Metrics", "Checkpointing", "Experiment Tracking", "Gradient Accumulation"],
         label="Select a feature you would like to integrate",
         value="Basic",
     )
-    with gr.Row():
-        with gr.Column():
-            feature = gr.Markdown("## Accelerate Code")
-            out = gr.Markdown(default[0])
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Explanation")
-            explanation = gr.Markdown(default[2])
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Documentation Links")
-            docs = gr.Markdown(default[3])
-    inp.change(fn=change, inputs=[inp, textbox], outputs=[out, feature, explanation, docs])
-def large_scale_training(textbox):
-    # textbox.value = "large_scale_training"
     inp = gr.Radio(
-        ["Multi GPU", "Multi Node Multi GPU", "AWS SageMaker", "DeepSpeed", "PyTorch FSDP", "Megatron-LM"],
-        label="Select a feature you would like to integrate",
-        value="Basic",
     )
-    with gr.Row():
-        with gr.Column():
-            feature = gr.Markdown("## Accelerate Config")
-            config = gr.Markdown("")
-    with gr.Row():
-        with gr.Column():
-            feature = gr.Markdown("## Accelerate Code")
-            out = gr.Markdown("")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Explanation")
-            explanation = gr.Markdown("")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("## Documentation Links")
-            docs = gr.Markdown("")
-    inp.change(fn=change, inputs=[inp, textbox], outputs=[config, out, feature, explanation, docs])
 # def big_model_inference():
@@ -126,16 +166,18 @@ def large_scale_training(textbox):
 with gr.Blocks() as demo:
     with gr.Tabs():
-        with gr.TabItem("Simplify your code and improve efficieny"):
             textbox = gr.Textbox(label="tab_name", visible=False, value="base")
             base_features(textbox)
-        with gr.TabItem("Large Scale Training"):
-            textbox = gr.Textbox(label="tab_name", visible=False, value="large_scale_training")
-            large_scale_training(textbox)
         with gr.TabItem("Big Model Inference"):
             # big_model_inference()
             pass
-        with gr.TabItem("Notebook Launcher Intergation"):
             # notebook_launcher()
             pass

+from contextlib import contextmanager
 import gradio as gr
+from markup import get_text, highlight
 from template import get_templates
 templates = get_templates()
+def fill_tab(title, explanation):
+    """
+    Fill the tab with the appropriate title and explanation.
+    """
+    return gr.Markdown(title), gr.Markdown(explanation)
+@contextmanager
+def new_section():
+    """
+    A context manager to create a new section in the interface. Equivalent of:
+    ```python
+    with gr.Row():
+        with gr.Column():
+            ...
+    ```
+    """
+    with gr.Row():
+        with gr.Column():
+            yield
 def change(inp, textbox):
     """Based on an `inp`, render and highlight the appropriate code sample.
     if textbox == "base":
         code, explanation, docs = get_text(inp, textbox)
         if inp == "Basic":
+            return (
+                highlight(code),
+                "## Accelerate Code (Base Integration)",
+                explanation,
+                docs,
+            )
         elif inp == "Calculating Metrics":
             return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
         else:
             return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
+    elif textbox == "training_configuration":
+        yaml, changes, command, explanation, docs = get_text(inp, textbox)
+        return (highlight(yaml), highlight(changes), command, explanation, docs)
+    else:
+        raise ValueError(f"Invalid tab name: {textbox}")
+default_base = change("Basic", "base")
+default_training_config = change("Multi GPU", "training_configuration")
 def base_features(textbox):
     inp = gr.Radio(
+        [
+            "Basic",
+            "Calculating Metrics",
+            "Checkpointing",
+            "Experiment Tracking",
+            "Gradient Accumulation",
+        ],
         label="Select a feature you would like to integrate",
         value="Basic",
     )
+    with new_section():
+        feature, out = fill_tab("## Accelerate Code", default_base[0])
+    with new_section():
+        _, explanation = fill_tab("## Explanation", default_base[2])
+    with new_section():
+        _, docs = fill_tab("## Documentation Links", default_base[3])
+    inp.change(
+        fn=change, inputs=[inp, textbox], outputs=[out, feature, explanation, docs]
+    )
+def training_config(textbox):
     inp = gr.Radio(
+        [
+            "AWS SageMaker",
+            "DeepSpeed",
+            "Megatron-LM",
+            "Multi GPU",
+            "Multi Node Multi GPU",
+            "PyTorch FSDP",
+        ],
+        label="Select a distributed YAML configuration you would like to view.",
+        value="Multi GPU",
+    )
+    with new_section():
+        _, yaml = fill_tab("## Example YAML Configuration", default_training_config[0])
+    with new_section():
+        _, changes = fill_tab(
+            "## Changes to Training Script", default_training_config[1]
+        )
+    with new_section():
+        _, command = fill_tab("## Command to Run Training", default_training_config[2])
+    with new_section():
+        _, explanation = fill_tab("## Explanation", default_training_config[3])
+    with new_section():
+        _, docs = fill_tab("## Documentation Links", default_training_config[4])
+    inp.change(
+        fn=change,
+        inputs=[inp, textbox],
+        outputs=[yaml, changes, command, explanation, docs],
     )
 # def big_model_inference():
 with gr.Blocks() as demo:
     with gr.Tabs():
+        with gr.TabItem("Basic Training Integration"):
             textbox = gr.Textbox(label="tab_name", visible=False, value="base")
             base_features(textbox)
+        with gr.TabItem("Launch Configuration"):
+            textbox = gr.Textbox(
+                label="tab_name", visible=False, value="training_configuration"
+            )
+            training_config(textbox)
         with gr.TabItem("Big Model Inference"):
             # big_model_inference()
             pass
+        with gr.TabItem("Launching from Notebooks"):
             # notebook_launcher()
             pass

src/markup.py CHANGED Viewed

@@ -14,6 +14,7 @@
 from template import get_filename
 _remove_color = "rgb(103,6,12)"
 _addition_color = "rgb(6,103,12)"

 from template import get_filename
 _remove_color = "rgb(103,6,12)"
 _addition_color = "rgb(6,103,12)"

src/template.py CHANGED Viewed

@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
 TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]
@@ -27,4 +28,6 @@ def get_templates() -> dict:
     """
     Returns a dictionary of template type to code content
     """
-    return {template: open(get_filename("base", template)).read() for template in TEMPLATES}

 # limitations under the License.
 import os
 TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]
     """
     Returns a dictionary of template type to code content
     """
+    return {
+        template: open(get_filename("base", template)).read() for template in TEMPLATES
+    }