405b more #552

Closed
wants to merge 250 commits into from
Commits (250)
054f088
check in tokenizer.model for ease of dev setup (#59)
wanchaol Feb 13, 2024
bfe2b58
Add truncated llama style model init via reset parameters() (#54)
lessw2020 Feb 14, 2024
60f021a
add model num params display, gpu memory metrics (#56)
lessw2020 Feb 15, 2024
ad69e62
add TensorBoard logging with loss and wps
tianyu-l Feb 15, 2024
a4663b1
add memory metrics to TensorBoard
tianyu-l Feb 17, 2024
2daf53f
modify data split to use HF api
tianyu-l Feb 21, 2024
50d69f6
add multinode support via slurm trainer, large scale race condition f…
lessw2020 Feb 22, 2024
8ad4dcb
add configurable unique layer init, clean up lr and loss display (#64)
lessw2020 Feb 22, 2024
8097c26
add bunch of cleanups and design principle section (#71)
wanchaol Feb 23, 2024
28f431f
delete the linter to see if re-adding it helps (#80)
wconstab Feb 23, 2024
ebbb1cb
Unified config manager for toml and command line (#76)
gnadathur Feb 24, 2024
bccad90
Whc/add linter (#81)
wconstab Feb 24, 2024
ab75dbd
Add 4GPU unit test (#82)
wconstab Feb 24, 2024
468ce8f
update readme (#74)
wanchaol Feb 24, 2024
3fce6bb
move config folder to root and adjust options (#83)
wanchaol Feb 24, 2024
3b48039
add iter time tracking via cuda events, add data loading times, add c…
lessw2020 Feb 26, 2024
df77f4e
Fill missing options in toml file wih argparse defaults (#91)
gnadathur Feb 26, 2024
325951f
support infinite loop over alpaca dataset
tianyu-l Feb 26, 2024
b12b6dd
Add color to console output if local logging, auto avoid color loggin…
lessw2020 Feb 27, 2024
254279f
update GPU metrics logging to GiB (gibibytes) (#95)
lessw2020 Feb 27, 2024
4c03475
improve TensorBoard instructions in README
tianyu-l Feb 27, 2024
7ea0679
Enable libUV for torchtrain (#98)
gnadathur Feb 28, 2024
e60c573
use warmup steps for lr scheduler, ban steps == -1 (#99)
wanchaol Feb 29, 2024
900b215
Add llama 7B config (#100)
wanchaol Feb 29, 2024
6e87471
add selective activation checkpointing
tianyu-l Feb 29, 2024
1b343f2
Add job description field in toml (#101)
gnadathur Mar 1, 2024
42f8907
fix 2D parallel crash caused by all-reduce on 2D world_mesh
tianyu-l Mar 2, 2024
4042b05
Load missing keys default from argparse (#111)
gnadathur Mar 5, 2024
6529af1
Add meta_init, enable it as default init process (#84)
lessw2020 Mar 5, 2024
5f0eaea
Fix feedback from PR 111 (#113)
gnadathur Mar 5, 2024
1ce8188
fix SP minor issues
tianyu-l Mar 5, 2024
bb5c4c6
enable loss parallel in SP
tianyu-l Mar 6, 2024
f31adb0
Float8_experimental option for training (#102)
drisspg Mar 6, 2024
6927e45
add miniPile dataset for pretraining, 1M entries (solves the 'out of …
lessw2020 Mar 7, 2024
d902a47
add data loading option to load from local file system
tianyu-l Mar 7, 2024
422910b
add llama 13B configs
wanchaol Mar 9, 2024
af221ce
add llama 70B toml
wanchaol Mar 9, 2024
5e36c74
set betas and weight decay for optimizers
wanchaol Mar 9, 2024
08b332c
Add c4 dataset (177M, streaming), update multi-node support for lates…
lessw2020 Mar 9, 2024
1d11cf5
Add openwebtext dataset for larger scale training without shuffling (…
lessw2020 Mar 12, 2024
2722865
[TorchTrain][Checkpoint] Fix TrainState state_dict to unblock loading…
wz337 Mar 12, 2024
2369861
improve logging
tianyu-l Mar 13, 2024
3262a8b
use SequenceParallel style in tp/sp (#133)
wanchaol Mar 13, 2024
d9253ee
support TP-only parallelism
tianyu-l Mar 13, 2024
b42ce91
disable verbose print from profiling
tianyu-l Mar 13, 2024
3ac610b
add Selective layer activation checkpointing, single control for tur…
lessw2020 Mar 14, 2024
af56ae0
remove per iter syncronize
tianyu-l Mar 14, 2024
073909b
Shorten nccl comm timeout and enable flight recorder dumping (#103)
wconstab Mar 15, 2024
e3204c6
fix up gpu memory monitoring and logging
tianyu-l Mar 15, 2024
a257bc3
Separate timeout during init and training (#149)
wconstab Mar 15, 2024
1d6100c
Update activation check with updates to config manager (#152)
drisspg Mar 20, 2024
ae9a966
Refactor to clean up parallelisms/__init__.py
wconstab Mar 20, 2024
47bb509
enable gc control scheduling to help avoid stragglers (#148)
lessw2020 Mar 20, 2024
fcca670
Add float8 specific parallel strategies (#153)
drisspg Mar 20, 2024
5d28009
add MFU to metrics
tianyu-l Mar 20, 2024
35d881e
disable buffer reuse for compile for now (#156)
wanchaol Mar 21, 2024
f080027
refactor config manager and support cmd overrides (#157)
wanchaol Mar 22, 2024
34732f5
Add support for generating debug traces on failure
chauhang Mar 24, 2024
e008027
rename sequence_parallel to tensor_parallel (#162)
wanchaol Mar 25, 2024
44808f9
add basic AC configs for 13B and 70B (#169)
wanchaol Mar 27, 2024
bb61af0
[TorchTrain][Checkpoint] Update train state to include global_avg_los…
wz337 Mar 27, 2024
6500bc6
Basic integration test infra (#170)
gnadathur Mar 27, 2024
479694f
Add 2D integration test (FSDP + TP) (#171)
gnadathur Mar 27, 2024
02923f0
Used per-parameter FSDP (#165)
awgu Mar 28, 2024
615f9c1
plot losses in loaded TrainState to TensorBoard
tianyu-l Mar 28, 2024
b1349da
Removed setting global flag for `swap_tensors` since not needed anymore
awgu Mar 29, 2024
65f0297
Add integration test with compile enabled (#183)
gnadathur Apr 2, 2024
e1e17c9
remove folding and unfolding of sequence dim in model.py
tianyu-l Apr 3, 2024
b9a4548
bump comm.train_timeout_seconds (#189)
wanchaol Apr 4, 2024
3686897
fix checkpoint parser
wz337 Apr 5, 2024
7872248
support sequence of tests and add checkpoint test
wz337 Apr 5, 2024
5ac3aa6
Make freqs_cis a persistent buffer for pp init
wconstab Apr 5, 2024
5379282
Delete grad scaler, which is unsupported/unused
wconstab Apr 5, 2024
d8e64cc
Factor out loss_fn to share code with pipeline par
wconstab Apr 5, 2024
0397fef
[TorchTrain] Minor fix for #197 (#204)
wz337 Apr 5, 2024
cd1e5e8
Add FusedRMSNorm (Triton kernel, +15% eager), Add NPLayerNorm, Enable…
lessw2020 Apr 5, 2024
f795361
remove .item() per iter
tianyu-l Apr 5, 2024
946780a
Removed cache_k and cache_v comments
awgu Apr 10, 2024
18adb2f
Some more cleanups
awgu Apr 10, 2024
ef4c5d2
avoid record streams and make color printing a config
tianyu-l Apr 10, 2024
6629659
fix SAC to use the correct reduce_scatter op (#215)
wanchaol Apr 10, 2024
ddf916e
Test runner raises exception on failures (#216)
gnadathur Apr 10, 2024
ecdbacc
Revert "Separate TransformerEmbedding layer (#33)"
wconstab Apr 10, 2024
656be68
Fix 2DParallel test (#219)
gnadathur Apr 10, 2024
97fe9a4
Added initial FSDP readme
awgu Apr 10, 2024
ce05f65
[TorchTrain][Checkpoint] Add model_weights_only option to train_confi…
wz337 Apr 11, 2024
00293cb
Rename to torchtitan (#221)
wanchaol Apr 11, 2024
c08f617
[TorchTitan] Add destory process group at the end of training (#223)
wz337 Apr 12, 2024
7712f72
Add 1 sec delay to rank 0 cleanup (#224)
gnadathur Apr 12, 2024
71621a2
[Torchtrain][Checkpoint] Add support to allow dtype conversion (#222)
wz337 Apr 12, 2024
5aa0aec
[TorchTitan] Remove checkpoint folder at the end in test_runner.py (#…
wz337 Apr 12, 2024
cb24eb5
codebase cleanup
tianyu-l Apr 15, 2024
3cfdbf2
Update README to reflect positioning (#229)
wanchaol Apr 16, 2024
db04c7e
First release readme (#227)
lessw2020 Apr 16, 2024
f504816
Update licenses and headers (#231)
wanchaol Apr 16, 2024
41fb267
use permalink for logo image (#232)
lessw2020 Apr 16, 2024
82c2518
[TorchTitan][Checkpoint] Move checkpoint folder under dump_folder and…
wz337 Apr 16, 2024
d42a7d1
use combo of html and local file src for logo (#234)
lessw2020 Apr 16, 2024
80103a9
add performance -- infra metrics and loss curves (#237) (#238)
lessw2020 Apr 16, 2024
09e7bec
add license section in readme (#239)
wanchaol Apr 16, 2024
22aa488
[TorchTitan][Checkpoint] Add a step-by-step instruction for checkpoin…
wz337 Apr 16, 2024
81138d6
more license headers (#240)
wanchaol Apr 16, 2024
04f5b82
Update README (#242)
wanchaol Apr 16, 2024
4f6ed9a
Add torchtune checkpoint link, modify product position statement loca…
lessw2020 Apr 16, 2024
cd55a38
Add pyproject and upgrade version (#236)
wanchaol Apr 16, 2024
78b843b
minor doc updates - remove asynch checkpt ref, grammar on prod positi…
lessw2020 Apr 16, 2024
ce0fff0
Fix multi-line string usage (#244)
gnadathur Apr 16, 2024
7b353c8
polish toml files
tianyu-l Apr 16, 2024
bc7fec5
[torchtitan][checkpoint][doc] Minor fix checkpoint doc (#246)
wz337 Apr 16, 2024
a682505
fix default max_seq_len for freq_cis init (#248)
wanchaol Apr 17, 2024
1ea4dee
set max_seq_len before training to make it align with input data (#249)
wanchaol Apr 17, 2024
55c8e48
fix pypi docs
tianyu-l Apr 17, 2024
fd9b498
update dataset to use c4
tianyu-l Apr 18, 2024
978c5c6
Add c4_mini, a local 45K dataset (subset of c4) (#253)
lessw2020 Apr 18, 2024
4020e92
remove logo, update pre-release date to 4/18 (#254)
lessw2020 Apr 18, 2024
51a6f6f
add intro video (#233)
lessw2020 Apr 18, 2024
6aafe3c
add performance file to show convergence with 64 a100s (#255)
lessw2020 Apr 18, 2024
35470ca
Support Llama3 8b/70b (#256)
wanchaol Apr 20, 2024
960e70f
polish llama 3 setup
tianyu-l Apr 22, 2024
e1c116a
reenable integration tests with a test tokenizer (#259)
wanchaol Apr 23, 2024
be432e1
warn supported dataset checks instead of throw (#260)
wanchaol Apr 24, 2024
192ed48
De-dup repeated `freqs_cis` computation code
awgu Apr 24, 2024
f38766e
update readme.md and performance.md
tianyu-l Apr 24, 2024
0eacbae
followup changes to allow unsupported datasets
tianyu-l Apr 24, 2024
217cc94
fix ac 'checkpointing' spelling, minor spacing tweaks (#265)
lessw2020 Apr 24, 2024
e3b47ea
Update legal terms (#269)
lessw2020 Apr 25, 2024
eed7495
apply less heavy profiling
tianyu-l Apr 25, 2024
3393c2a
Showcase where the product positioning lies more clearly (#272)
soumith Apr 25, 2024
568dad6
Doc Fixes (#273)
msaroufim Apr 25, 2024
3e13e24
fix lr scheduling by checkpointing scheduler
tianyu-l Apr 26, 2024
f03c128
insert barrier to profiler to resolve collectives timeout
tianyu-l Apr 25, 2024
42549a9
some misc changes (#278)
wanchaol Apr 26, 2024
0d09a32
inherit stateful protocol where appropriate
tianyu-l Apr 26, 2024
06da6c2
Fixed docs on HSDP sharding/replication dims
awgu Apr 29, 2024
a843abf
Add more Float8 description (#284)
drisspg Apr 29, 2024
d442743
Remove unneeded torchvision/audio deps
wconstab Apr 29, 2024
e7f2d28
fix 3d mesh order (#288)
wanchaol Apr 30, 2024
4e5ffaf
unify data loading from HF and from disk
tianyu-l Apr 30, 2024
58b1169
Add periodic integration test with signal (#289)
gnadathur May 1, 2024
4d8c245
exclude embedding in MFU computation
tianyu-l Apr 26, 2024
17cda29
Add support for seed checkpoint creation for meta-init flow
wconstab May 2, 2024
1a6caf2
remove unnecessary install of torchtitan
tianyu-l May 2, 2024
787a571
Remove unnecessary .to() inside model forward
wconstab May 2, 2024
695bd01
Fix the incorrect step log for profiler after resuming from a checkpo…
fegin May 3, 2024
143b586
turn off dynamic shape for torch.compile (#297)
wanchaol May 3, 2024
f72a2a0
Renamed `bsz` to `bs` for consistency; removed dead code
awgu May 3, 2024
3295448
Implement async_checkpoint
fegin May 7, 2024
f5a3ad7
simplify embedding + first transformer block TP (#314)
wanchaol May 8, 2024
e64c6ca
Only include checkpoints that have .metadata written (#315)
liangluofb May 10, 2024
3444c4c
Refactor freqs_cis slice to be safer for PP
wconstab May 11, 2024
7f92f45
Make Transformer tolerate missing layers for PP
wconstab May 11, 2024
0973bab
Use torch generic workflow for CI
wconstab May 15, 2024
5ba0a4b
[checkpointing] import async checkpoint with pinned memory only when …
tianyu-l May 15, 2024
847189d
Add a workflow to build torchtitan-ubuntu-20.04-clang12 Docker image …
huydhn May 16, 2024
a2ace60
Make pip install torch quiet
wconstab May 17, 2024
f2c3a11
Make test_runner.py warn on non-empty output dir
wconstab May 17, 2024
3bd14ec
Expose mixed_precision dtype arguments
wconstab May 21, 2024
99a73dd
Use stateful dataloader to checkpoint data iteration order and token …
gokulavasan May 21, 2024
e7c31be
Add Pipeline Parallel (and 2D PP+FSDP) support
wconstab May 21, 2024
c5a9718
fix i periodic integration test and add helper message on torchdata i…
tianyu-l May 22, 2024
60810a9
torch.compile each TransformerBlock instead of the whole model (#268)
wanchaol May 22, 2024
910662c
Make test_runner use separate logger with default INFO
wconstab May 22, 2024
6807909
Fix llama_13b.toml -> llama2_13b.toml in multinode_trainer.slurm (#350)
pbelevich May 22, 2024
638ec48
Fix bug in PP output layer shape
wconstab May 22, 2024
c87e8bc
Update pipelining import after change on pytorch
wconstab May 23, 2024
02ae169
update .gitignore to screen out slew of new temp files (#359)
lessw2020 May 24, 2024
1ceaa4e
Add test for PP tracer frontend
wconstab May 24, 2024
6a8455e
only produce tensorboard logs on rank 0 by default
tianyu-l May 29, 2024
5831e81
replace old torch dependency in requirements.txt
tianyu-l May 29, 2024
3343d1d
Add --test option to specify test to run (#368)
kwen2501 May 30, 2024
07fa503
use integration test as the badge shown on the homepage
tianyu-l May 29, 2024
54b5fa2
keep only latest k checkpoints (#366)
liangluofb May 31, 2024
a0f82d5
Make seed checkpoint creation work on CPU
wconstab Jun 3, 2024
8badf7e
Fix start/stop layer parsing
wconstab Jun 3, 2024
3050098
Use general way to access and update submodules
kwen2501 Jun 3, 2024
0594c04
Make metrics logging work for pipeline parallelism
wconstab Jun 4, 2024
c89aa40
[RFC] Allow ModelWrapper and OptimizerWrapper to accept multiple models
fegin Jun 5, 2024
3bc7678
Add 3D support
wconstab Jun 4, 2024
7cf41bb
[torchtitan][optim] Add fused as an option in train config (#355)
wz337 Jun 6, 2024
baa678c
[torchtitan] Fix test runner fused optim tests (#384)
wz337 Jun 6, 2024
104bd6c
Abstract out out optimizer params and update foreach calling conventi…
drisspg Jun 7, 2024
ccf1ed8
DeviceMesh BC fix (#387)
wanchaol Jun 9, 2024
6de5d31
BC fix for ManualPipelineStage import (#388)
wanchaol Jun 9, 2024
9700e0f
fix missing tb logs
tianyu-l Jun 10, 2024
3bb7bf3
add the 8-gpu test badge and use correct links for the integration te…
tianyu-l Jun 10, 2024
e858ab4
Fix 1D PP tracer test
kwen2501 Jun 10, 2024
c5d5c1f
del logits=(bs, seq_len, vocab_size) to save 3.9G memory (#391)
weifengpy Jun 12, 2024
e991ae4
Update contributing.md (#385)
H-Huang Jun 12, 2024
763b810
update all toml files to use experimental section (#392)
wanchaol Jun 12, 2024
e17e3b8
enable TP fp8 allgather with PrepareFloat8ModuleInput (#393)
wanchaol Jun 13, 2024
a4cd9ba
Update unit_test_cpu.yaml with cpu nightly (#396)
wanchaol Jun 13, 2024
33f301d
Fix SAC BC breaking and renaming to ac_freq (#397)
wanchaol Jun 13, 2024
093ba15
SAC API follow ups to restore old behavior (#401)
wanchaol Jun 13, 2024
d761994
enable TritonFusedRMSNorm with local_map annotation (#404)
XilunWu Jun 14, 2024
3e946a1
Cosmetic changes to train.py
kwen2501 Jun 14, 2024
d5b7525
Break down parallelize_llama for inference cases
kwen2501 Jun 14, 2024
4a1095a
Change debugmodel to have 8 layers
wconstab Jun 17, 2024
443ead2
Prepare train.py for model chunks for pipelining
wconstab Jun 17, 2024
8adbfa3
dump memory snapshot to analyze OOMs (#395)
weifengpy Jun 19, 2024
c88d0bc
whole_model for fp8 (#414)
weifengpy Jun 20, 2024
ac83f9c
Add train loop support for looped PP schedules
wconstab Jun 21, 2024
abb9e15
Set `record_shapes=True` for profiler
awgu Jun 24, 2024
4ba94bd
Improved `repeat_kv` eager perf
awgu Jun 24, 2024
236d2ff
Adding FSDP Memory Tracking and Estimation
sanketpurandare Jun 25, 2024
cb73810
Adding integration test for FSDP Memory Tracking and Estimation
sanketpurandare Jun 25, 2024
f3fecb2
by default disable heavy memory profiling
tianyu-l Jun 26, 2024
ea8c5c8
Add the option to turn on async-TP
yifuwang Jun 26, 2024
b0ed7f0
Modifying memory estimation options and minor changes
sanketpurandare Jul 1, 2024
21dd980
add comment pointing to Sequence Parallel optimization example
tianyu-l Jul 4, 2024
3fca883
switch float8 logic from Float8DynamicLinear to Float8Linear (#436)
vkuzo Jul 8, 2024
2f5285b
Removed `_experimental_support_context_fn_in_torch_utils_checkpoint`
awgu Jul 10, 2024
f0ca3e8
Reordered TP parallel plan to follow execution order
awgu Jul 10, 2024
c7a6a3e
Made some stylistic changes to `apply_dp`
awgu Jul 10, 2024
bc3ec02
Refactored activation checkpointing
awgu Jul 10, 2024
7afe902
compiled RMSNorm
tianyu-l Jul 10, 2024
0929280
Renamed parallel styles for transformer block weights
awgu Jul 10, 2024
040ea1d
Added type annotations and more stylistic changes
awgu Jul 10, 2024
e591f62
[Cleanup] Remove libuv from run_llama_train.sh
wconstab Jul 15, 2024
6f4d1d1
[Cleanup] Organize run_llama_train.sh options
wconstab Jul 15, 2024
db609d5
[Cleanup] Split run_llama_train.sh and run_memory_estimation.sh
wconstab Jul 15, 2024
07ab2f9
[Cleanup] Remove unused TRAINER_DIR
wconstab Jul 15, 2024
0bb6980
Add educational code pointers to top level README
wconstab Jul 15, 2024
f025335
enable FSDP2 + fp8 all-gather and fix TP fp8 all-gather (#413)
weifengpy Jul 16, 2024
298494b
import float8_experimental only when fp8 is enabled and install it in…
weifengpy Jul 17, 2024
183390e
skip fp8 CI on non-H100 GPUs (#465)
weifengpy Jul 17, 2024
4a2de42
clean up float8 configs in torchtitan (#466)
vkuzo Jul 17, 2024
2937167
Add support of DDP and experimental CompiledAutograd
fegin Jul 18, 2024
9b37408
add torch.compile + FSDP2 float8 all-gather in CI (#468)
weifengpy Jul 19, 2024
b502cdc
[float8] keep model.output as `nn.Linear` (high precision, not fp8) (…
weifengpy Jul 19, 2024
d76b77f
remove CI for FSDP2 + fp8 all-gather (#470)
weifengpy Jul 20, 2024
0f70507
dynamically update torch.compile cache config to ensure async tp supp…
lessw2020 Jul 21, 2024
00a3c21
Fix 8gpu PP failure due to 2D DCP disablement
wconstab Jul 15, 2024
5124c14
update float8 integration after UX changes (#484)
vkuzo Jul 26, 2024
43584e0
Re-enable FSDP2 Mem Tracker integration tests
Jul 26, 2024
668f6cd
Used `partial` instead of global vars for LR scheduling
awgu Jul 29, 2024
f13fe3f
[EZ] Add logs for some basic training params so that we can verify in…
fduwjj Jul 30, 2024
b012237
make float8 scaling type configurable (#489)
vkuzo Jul 30, 2024
9cf4b2f
[PP] add flexible interleaved 1f1b schedule #490 (#493)
H-Huang Jul 30, 2024
b069f70
move float8 callsites to torchao.float8 (#492)
vkuzo Jul 30, 2024
3ddce59
[BE][1/n] simplify train.py
tianyu-l Jul 31, 2024
7119d0c
[BE][2/n] use proper method signatures in parallelize_llama
tianyu-l Jul 31, 2024
04d219a
[BE][3/n] wrap fp8 logic using Float8Handler
tianyu-l Jul 31, 2024
e457deb
Bring LLaMa 3.1 405B to TorchTitan family (#481)
fduwjj Aug 1, 2024
72a1614
[TP] Infer local n_heads instead of ad-hoc model changes
kwen2501 Aug 2, 2024
de9fd2b
some compile-related updates
tianyu-l Aug 2, 2024
b3f2f58
[EZ][405B] Use scientific notation for 405B model lr
fduwjj Aug 5, 2024
904913f
[405B] Add performance data for 405B model
fduwjj Aug 21, 2024
7928650
Merge branch 'main' into 405b_more
fduwjj Aug 21, 2024
57d4725
Handle rebase
fduwjj Aug 21, 2024
cbcd3f2
Further handle merge
fduwjj Aug 21, 2024
Files changed
README.md (2 additions, 1 deletion)

@@ -22,7 +22,7 @@ Our guiding principles when building `torchtitan`:

You may want to see how the model is defined or how parallelism techniques are applied. For a guided tour, see these files first:
* [train.py](https://github.com/pytorch/torchtitan/blob/main/train.py) - the main training loop and high-level setup code
* [torchtitan/parallelisms/parallelize_llama.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py) - helpers for applying Data Parallel, Tensor Parallel, activation checkpointing, and `torch.compile` to the model
* [torchtitan/parallelisms/parallelize_llama.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py) - helpers for applying Data / Tensor / Pipeline Parallelisms to the model
* [torchtitan/parallelisms/pipeline_llama.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/pipeline_llama.py) - helpers for applying Pipeline Parallel to the model
* [torchtitan/checkpoint.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/checkpoint.py) - utils for saving/loading distributed checkpoints
* [torchtitan/float8.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/float8.py) - utils for applying Float8 techniques
@@ -64,6 +64,7 @@ git clone https://github.com/pytorch/torchtitan
```bash
cd torchtitan
pip install -r requirements.txt
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 # or cu118
pip3 install --pre torchdata --index-url https://download.pytorch.org/whl/nightly
```

### Downloading a tokenizer
Binary file added assets/images/llama3_1_405B_loss_curves.png
docs/performance.md (15 additions, 0 deletions)

@@ -19,6 +19,21 @@ Next we show the loss curves for Llama 3 8B and Llama 3 70B training with both 1
![image](../assets/images/llama3_loss_curves.png)


## Llama 3.1 performance numbers

We measured the performance of the 405B model released in [Llama 3.1](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1). Because this model is much larger, we ran on 128 H100 GPUs to test both performance and loss curves. Below are the performance results for the 405B model with the optimizations we have developed. We hit OOM with 1D parallelism even at batch size 1, so we only tested the 2D (FSDP + TP) case.


| Model size | Batch size | Activation checkpointing | WPS | MFU | Optimizations |
| ----- | ----- | ----- | ----- | ----- | ----- |
| 405B | 2 | full | 118 | 37.1% | None |
| 405B | 2 | full | 177 | 27.77% | FP8 |
| 405B | 2 | full | 185 | 29.03% | FP8 + async TP |

And the loss curves are shown below:

![image](../assets/images/llama3_1_405B_loss_curves.png)
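
As a rough sanity check on how the WPS and MFU columns relate, here is a back-of-the-envelope sketch in Python. It is not torchtitan's metrics code (which accounts for FLOPs more carefully, e.g. excluding the embedding per commit 4d8c245 above), and both the peak-FLOPS figure and the per-GPU interpretation of WPS are assumptions, so expect it to land near, not exactly on, the table's numbers.

```python
# Back-of-the-envelope MFU estimate from a WPS figure (a sketch, not torchtitan's
# metrics code). Assumptions: flops/token ~= 6 * n_params, WPS is tokens/sec per
# GPU, and the per-GPU peak is the H100 BF16 dense figure of ~989 TFLOPS.

def approx_mfu(n_params: float, wps_per_gpu: float, peak_flops: float = 989e12) -> float:
    achieved_flops = 6.0 * n_params * wps_per_gpu  # model FLOPs per GPU per second
    return achieved_flops / peak_flops

# Example with the first table row (405B parameters, 118 WPS):
print(f"approx. MFU: {approx_mfu(405e9, 118):.1%}")  # ballpark only; see caveats above
```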

## Llama 2 performance numbers

Below are the WPS and MFU results which torchtitan achieves on Llama 2 models with FSDP2 on 64 A100 (80GB) GPUs.
estimation.py (25 additions, 16 deletions)

@@ -16,7 +16,7 @@

from torchtitan.config_manager import JobConfig
from torchtitan.datasets import build_tokenizer
from torchtitan.float8 import Float8Handler
from torchtitan.float8_linear import Float8Handler
from torchtitan.logging import init_logger, logger
from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
from torchtitan.optimizer import build_lr_schedulers, build_optimizers
@@ -122,25 +122,33 @@ def loss_fn(pred, labels):
f"Building {model_name} {job_config.model.flavor} with {model_config}"
)
with torch.device("meta"):
model = model_cls.from_model_args(model_config)
whole_model = model_cls.from_model_args(model_config)

# a no-op handler if float8 is not enabled
# a no-op handler if fp8 is not enabled
float8_handler = Float8Handler(job_config, parallel_dims)
# swap to Float8Linear based on float8 configs
float8_handler.convert_to_float8_training(model)
# swap to Float8Linear based on fp8 config
float8_handler.convert_to_float8_training(whole_model)

# apply PT-D DP/TP parallelisms and activation checkpointing
models_parallelize_fns[model_name](model, world_mesh, parallel_dims, job_config)
model_parts = [whole_model]
model_parts = [
models_parallelize_fns[model_name](m, world_mesh, parallel_dims, job_config)
for m in model_parts
]

init_device = "cuda"
for model in model_parts:
model.to_empty(device=init_device)

model.to_empty(device="cuda")
if not active_fake_mode():
model.init_weights()
model.train()
whole_model.init_weights()

# build optimizer after applying parallelisms to the model
optimizers = build_optimizers([model], job_config)
optimizers = build_optimizers(model_parts, job_config)
lr_schedulers = build_lr_schedulers(optimizers.optimizers, job_config)

for model in model_parts:
model.train()
logger.info(f"Vocab size: {model_config.vocab_size}")
# Create a dummy batch instead of loading from a dataset
batch = (
@@ -157,31 +165,32 @@
device="cuda",
),
)
fsdp_memtracker = FSDPMemTracker(mod=model, optm=optimizers.optimizers[0])
fsdp_memtracker = FSDPMemTracker(mod=whole_model, optm=optimizers.optimizers[0])
fsdp_memtracker.track_inputs(batch)

with fsdp_memtracker:
for iter_idx in range(2):
input_ids, labels = batch
# train step
with train_context():
pred = model(input_ids)
pred = whole_model(input_ids)
loss = loss_fn(pred, labels)
del pred
loss.backward()

# clip gradients
torch.nn.utils.clip_grad_norm_(
model.parameters(), job_config.training.max_norm, foreach=True
)
for model in model_parts:
torch.nn.utils.clip_grad_norm_(
model.parameters(), job_config.training.max_norm, foreach=True
)
# sync float8 amaxes and scales
float8_handler.sync_float8_amax_and_scale_history(model)
# optimizer step
optimizers.step()
lr_schedulers.step()
# calculate float8 dynamic amax/scale for all-parameter for FSDP2
# it issues a single all-reduce for all parameters at once for better performance
float8_handler.precompute_float8_dynamic_scale_for_fsdp(model)
float8_handler.precompute_fp8_dynamic_scale_for_fsdp(model)
optimizers.zero_grad()
print(f"Peak Memory at iter: {iter_idx}")
fsdp_memtracker.display_snapshot("peak", units="MiB", tabulate=True)
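
The estimation.py changes above drive PyTorch's `FSDPMemTracker` over two dummy iterations and print a peak-memory snapshot. The sketch below isolates that pattern. It assumes the import path used by the PyTorch nightlies this repo targets, an already-initialized process group (e.g. under `torchrun`), and a model already sharded with FSDP2; treat it as a minimal illustration rather than a drop-in script.

```python
# Minimal sketch of the FSDPMemTracker pattern used in estimation.py (assumed
# import path; requires an initialized process group and an FSDP2-wrapped model).
from torch.distributed._tools.fsdp2_mem_tracker import FSDPMemTracker


def estimate_peak_memory(model, optimizer, batch, loss_fn, num_iters: int = 2):
    tracker = FSDPMemTracker(mod=model, optm=optimizer)
    tracker.track_inputs(batch)  # attribute the input tensors' memory to this run
    with tracker:                # allocations inside the context are recorded
        for _ in range(num_iters):
            input_ids, labels = batch
            loss = loss_fn(model(input_ids), labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
    # mirror the call in estimation.py: tabulated peak-memory snapshot in MiB
    tracker.display_snapshot("peak", units="MiB", tabulate=True)
```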
test/datasets/test_checkpoint.py (2 additions, 2 deletions)

@@ -11,8 +11,8 @@

class TestCheckpoint:
def test_c4_resumption(self):
dataset_name = "c4_test"
dataset_path = "./test/assets/c4_test"
dataset_name = "c4_mini"
dataset_path = "./torchtitan/datasets/c4_mini"
batch_size = 1
seq_len = 1024
world_size = 4
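
The test above exercises dataloader resumption from a checkpoint. As a hedged illustration of the mechanism underneath (torchdata's stateful dataloader, which is also why the README now installs a torchdata nightly), here is a minimal save/restore round trip; the function name is made up for the example, and the repo's real test goes through its own dataloader builder.

```python
# Illustrative sketch of dataloader-state checkpointing (not the repo's test code).
# Assumes torchdata's StatefulDataLoader; `dataset` is any dataset you supply.
from torchdata.stateful_dataloader import StatefulDataLoader


def resume_round_trip(dataset, batch_size: int = 1, consume: int = 3):
    dl = StatefulDataLoader(dataset, batch_size=batch_size)
    it = iter(dl)
    for _ in range(consume):        # consume a few batches
        next(it)
    state = dl.state_dict()         # checkpoint the iteration state

    resumed = StatefulDataLoader(dataset, batch_size=batch_size)
    resumed.load_state_dict(state)  # restore; iteration continues where it left off
    return next(iter(resumed))      # first batch after the resume point
```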