Add SMP v2 notebook for accelerating training with FP8 on P5. (#4578)

* Update example notebooks and related scripts for latest PT-2.2-TSM-2.2 release. Add FP8 training support on P5.
* Add example notebook for accelerating Llama-v2 training with FP8 on P5.
* Fix typo in version check.
* Update configurations. Revert jupyter notebook python version in metadata. Set activation_offloading=False for FP8 notebook. Explicitly enable use_smp_implementation in all SMP v2 notebooks.
* Update FP8 notebook docs.
* Set zipped_data=0 for use_fsx=False FP8 notebook.
* Update compute_tflops() script.
* Update minimum sagemaker pysdk version to `2.212`.
aws · Mar 7, 2024 · bf3cadc · bf3cadc
1 parent 123dbf4
commit bf3cadc
Show file tree

Hide file tree

Showing 14 changed files with 1,342 additions and 82 deletions.
diff --git a/...ributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb b/...ributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb
@@ -80,7 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.2\"\n",
+    "%pip install --upgrade \"sagemaker>=2.212\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -671,6 +671,7 @@
     "    \"auto_wrap_policy\": \"transformer_auto_wrap_policy\",\n",
     "    \"model_type\": model_type,\n",
     "    \"use_smp_flash_attn\": 1,\n",
+    "    \"use_smp_implementation\": 1,\n",
     "    \"patch_neox_rope\": 0,\n",
     "    \"distributed_backend\": \"nccl\",\n",
     "}\n",
@@ -882,7 +883,7 @@
     "        },\n",
     "    },\n",
     "    py_version=\"py310\",\n",
-    "    framework_version=\"2.0.1\",\n",
+    "    framework_version=\"2.2.0\",\n",
     "    # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "    output_path=s3_output_bucket,\n",
     "    max_run=86400,\n",

diff --git a/.../distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb b/.../distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb
@@ -22,7 +22,6 @@
     "\n",
     "The notebook is accompanied by the following files:\n",
     "- `train.py`: The entry point script that'll be passed to the SageMaker PyTorch estimator later in this notebook when launching the training job.\n",
-    "-\n",
     "- `arguments.py`: This file has functions for argument parsing (i.e. hyperparameters).\n",
     "- `checkpoints.py`: This file has functions for saving and loading checkpoints.\n",
     "- `data_utils`: This file has functions for handling S3 URLs.\n",
@@ -75,7 +74,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.2\"\n",
+    "%pip install --upgrade \"sagemaker>=2.212\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -624,7 +623,7 @@
     "\n",
     "tensor_parallel_degree = 2  # An integer in [1, world_size]\n",
     "hybrid_shard_degree = (\n",
-    "    4  # # An integer in [0, world_size // tensor_parallel_degree] and its default value is 0.\n",
+    "    4  # An integer in [0, world_size // tensor_parallel_degree] and its default value is 0.\n",
     ")\n",
     "offload_activations = True  # Enables SM activation offloading implementation.\n",
     "activation_loading_horizon = (\n",
@@ -662,6 +661,7 @@
     "    \"sharding_strategy\": \"hybrid_shard\",\n",
     "    \"train_batch_size\": 2,\n",
     "    \"use_smp_flash_attn\": 1,\n",
+    "    \"use_smp_implementation\": 1,\n",
     "    \"val_batch_size\": 4,\n",
     "    \"validation_freq\": save_steps,\n",
     "    \"vocab_size\": 50257,\n",
@@ -874,7 +874,7 @@
     "        },\n",
     "    },\n",
     "    py_version=\"py310\",\n",
-    "    framework_version=\"2.0.1\",\n",
+    "    framework_version=\"2.2.0\",\n",
     "    # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "    output_path=s3_output_bucket,\n",
     "    max_run=86400,\n",
@@ -956,7 +956,7 @@
     "            },\n",
     "        },\n",
     "        py_version=\"py310\",\n",
-    "        framework_version=\"2.0.1\",\n",
+    "        framework_version=\"2.2.0\",\n",
     "        # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "        output_path=s3_output_bucket,\n",
     "        max_run=86400,\n",

diff --git a/...istributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb b/...istributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb
@@ -80,7 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install --upgrade \"sagemaker>=2.2\"\n",
+    "%pip install --upgrade \"sagemaker>=2.212\"\n",
     "%pip install sagemaker-experiments"
    ]
   },
@@ -663,6 +663,7 @@
     "    \"auto_wrap_policy\": \"transformer_auto_wrap_policy\",\n",
     "    \"model_type\": model_type,\n",
     "    \"use_smp_flash_attn\": 1,\n",
+    "    \"use_smp_implementation\": 1,\n",
     "    \"distributed_backend\": \"nccl\",\n",
     "}\n",
     "\n",
@@ -867,7 +868,7 @@
     "        },\n",
     "    },\n",
     "    py_version=\"py310\",\n",
-    "    framework_version=\"2.0.1\",\n",
+    "    framework_version=\"2.2.0\",\n",
     "    # image_uri=$IMAGE,  # Either provide `framework_version` or `image_uri`\n",
     "    output_path=s3_output_bucket,\n",
     "    max_run=86400,\n",