add test on 910B
kkscilife committed Sep 3, 2024
1 parent 1c43663 commit 9ac9b9f
Showing 2 changed files with 156 additions and 17 deletions.
170 changes: 154 additions & 16 deletions .github/workflows/e2e_test.yaml
@@ -70,38 +70,176 @@ jobs:
          exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
  training_8GPU_4DP2TP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
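
Note: the -m "training_8GPU_4DP2TP" filter in the start command selects tests by pytest marker. A minimal sketch of how such a marker is typically declared is shown below; the decorator name mirrors the marker string in this workflow, but the test function name and body in ./tests/test_training/test_loss.py are assumptions for illustration only.

# Sketch only: marker usage assumed to match the -m filters in this workflow.
# In a real setup the markers would also be registered in pytest.ini or
# pyproject.toml to avoid PytestUnknownMarkWarning.
import pytest

@pytest.mark.training_8GPU_4DP2TP
def test_training_loss_4dp_2tp():
    # hypothetical body: run a few training steps with 4-way data parallelism
    # and 2-way tensor parallelism, then compare the loss curve to a baseline
    ...
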
  training_8GPU_4DP2TPSP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TPSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
  training_8GPU_4DP2PP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
  training_16GPU_4DP2TP2PP_MTP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MTP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
  training_16GPU_4DP2TP2PP_MSP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
  training_16GPU_4DP2TP2PP_FSP:
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_FSP_910B
        if: ${{ matrix.runner == '910B' }}
        run: |
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
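
Note: the two-node jobs above launch 16 ranks in total, and the job names encode the parallel layout: 4-way data parallel x 2-way tensor parallel x 2-way pipeline parallel (4 x 2 x 2 = 16). The extra arguments passed to ../910B_sco.sh (2 and "AllReduce") appear to select the node count and the collective communication mode, but that script is not part of this diff, so this is an inference. Below is a minimal sketch of how a global rank could be decomposed into (tp, dp, pp) coordinates for such a layout; the axis ordering is an assumption for illustration and may not match InternEvo's actual process-group construction.

# Sketch only: decompose a global torchrun rank into parallel coordinates for a
# 4DP x 2TP x 2PP layout (16 ranks across 2 nodes x 8 devices). The ordering
# (tp fastest, then dp, then pp) is assumed, not taken from the repository.
TP, DP, PP = 2, 4, 2

def parallel_coords(global_rank: int) -> dict:
    tp = global_rank % TP
    dp = (global_rank // TP) % DP
    pp = global_rank // (TP * DP)
    return {"tp": tp, "dp": dp, "pp": pp}

if __name__ == "__main__":
    for rank in range(TP * DP * PP):
        print(rank, parallel_coords(rank))
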
  training_llama2:
    runs-on: [t_cluster]
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3

      - name: training_llama2
      - name: training_llama2_910B
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
          exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
  training_internlm2:
    runs-on: [t_cluster]
    strategy:
      matrix:
        runner: [910B]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
          if [[ ${{ matrix.runner }} == 910B ]];then
            sudo git clean -ffdx
          fi
      - uses: actions/checkout@v3

      - name: training_internlm2
      - name: training_internlm2_910B
        run: |
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py
          exit_code=$?
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
          bash ../910B_sco.sh $jobname "$start_command"
3 changes: 2 additions & 1 deletion tests/test_training/test_loss.py
@@ -113,6 +113,7 @@ def train(
        launcher = "slurm"
    else:
        launcher = "torch"
    config.model.parallel_output = False

    initialize_distributed_env(config=config, launcher=launcher)
    assert hasattr(gpc, "config") and gpc.config is not None
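
Note: the torch launcher path relies on the rendezvous environment variables that torchrun exports for every worker (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT), which connect back to the start_command lines in the workflow above. A minimal, generic sketch of consuming them is shown below; it uses plain torch.distributed with the gloo backend for portability and is not the initialize_distributed_env implementation from this repository.

# Sketch only: generic env-var based init, not InternEvo's initialize_distributed_env.
import os
import torch.distributed as dist

def init_from_torchrun_env() -> None:
    # torchrun exports these for every worker process it spawns.
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    # init_method="env://" reads MASTER_ADDR / MASTER_PORT from the environment.
    dist.init_process_group(backend="gloo", init_method="env://",
                            rank=rank, world_size=world_size)

if __name__ == "__main__":
    init_from_torchrun_env()
    print(f"rank {dist.get_rank()} of {dist.get_world_size()} initialized")
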
@@ -158,7 +159,7 @@ def train(
    isp_communicator = initialize_parallel_communicator(model)

    # initialize loss function
    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
    criterion = FlashGPTLMLoss(parallel_output=gpc.config.model.parallel_output, label_smoothing=label_smoothing)

    # initialize the train data loader
    train_dl, dataset_types = build_train_loader_with_data_type()
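
Note: the two hunks work together: the test config now forces model.parallel_output = False, and the loss reads that flag instead of hard-coding True. In tensor-parallel models this flag commonly controls whether the output logits stay sharded along the vocabulary dimension (requiring a vocab-parallel loss) or are gathered to the full vocabulary first, so an ordinary cross entropy works. A minimal sketch of the non-parallel path the tests now take is below; it is a generic illustration, not FlashGPTLMLoss itself.

# Sketch only: the non-parallel (parallel_output=False) loss path, assuming the
# model has already gathered logits to the full vocabulary on every rank.
import torch
import torch.nn.functional as F

def lm_loss_full_vocab(logits: torch.Tensor, labels: torch.Tensor,
                       label_smoothing: float = 0.0) -> torch.Tensor:
    # logits: (batch * seq_len, vocab_size); labels: (batch * seq_len,)
    return F.cross_entropy(logits, labels, label_smoothing=label_smoothing)

# usage sketch with fake data
logits = torch.randn(8, 32000)           # pretend gathered logits
labels = torch.randint(0, 32000, (8,))   # pretend targets
print(lm_loss_full_vocab(logits, labels, label_smoothing=0.0))
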