diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 5c03cac5..01e8efed 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -70,38 +70,176 @@ jobs:
         exit_code=$?
         sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
 
+  training_8GPU_4DP2TP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2TP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_8GPU_4DP2TPSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2TPSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_8GPU_4DP2PP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2PP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_16GPU_4DP2TP2PP_MTP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_MTP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
+  training_16GPU_4DP2TP2PP_MSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_MSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
+  training_16GPU_4DP2TP2PP_FSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_FSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
   training_llama2:
-    runs-on: [t_cluster]
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
     - uses: actions/checkout@v3
-
-    - name: training_llama2
+    - name: training_llama2_910B
       run: |
-        source activate ${evo_env_torch21_flash2}
-        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
-        exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
 
   training_internlm2:
-    runs-on: [t_cluster]
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
    - uses: actions/checkout@v3
-
-    - name: training_internlm2
+    - name: training_internlm2_910B
       run: |
-        source activate ${evo_env_torch21_flash2}
-        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py
-        exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py
index 37123588..b311de4e 100644
--- a/tests/test_training/test_loss.py
+++ b/tests/test_training/test_loss.py
@@ -113,6 +113,7 @@ def train(
         launcher = "slurm"
     else:
         launcher = "torch"
+    config.model.parallel_output = False
     initialize_distributed_env(config=config, launcher=launcher)
 
     assert hasattr(gpc, "config") and gpc.config is not None
@@ -158,7 +159,7 @@ def train(
     isp_communicator = initialize_parallel_communicator(model)
 
     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
+    criterion = FlashGPTLMLoss(parallel_output=gpc.config.model.parallel_output, label_smoothing=label_smoothing)
 
     # initialize the train data loader
     train_dl, dataset_types = build_train_loader_with_data_type()