diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 5c03cac5..01e8efed 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -70,38 +70,176 @@ jobs:
         exit_code=$?
         sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
 
+  training_8GPU_4DP2TP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2TP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_8GPU_4DP2TPSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2TPSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_8GPU_4DP2PP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_8GPU_4DP2PP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
+
+  training_16GPU_4DP2TP2PP_MTP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_MTP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
+  training_16GPU_4DP2TP2PP_MSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_MSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
+  training_16GPU_4DP2TP2PP_FSP:
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 15
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+        echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
+    - uses: actions/checkout@v3
+    - name: training_16GPU_4DP2TP2PP_FSP_910B
+      if: ${{ matrix.runner == '910B' }}
+      run: |
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+
   training_llama2:
-    runs-on: [t_cluster]
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
     - uses: actions/checkout@v3
-
-    - name: training_llama2
+    - name: training_llama2_910B
       run: |
-        source activate ${evo_env_torch21_flash2}
-        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
-        exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
 
   training_internlm2:
-    runs-on: [t_cluster]
+    strategy:
+      matrix:
+        runner: [910B]
+    runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
+        if [[ ${{ matrix.runner }} == 910B ]];then
+          sudo git clean -ffdx
+        fi
    - uses: actions/checkout@v3
-
-    - name: training_internlm2
+    - name: training_internlm2_910B
       run: |
-        source activate ${evo_env_torch21_flash2}
-        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py
-        exit_code=$?
-        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
+        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
+        bash ../910B_sco.sh $jobname "$start_command"
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py
index 37123588..b311de4e 100644
--- a/tests/test_training/test_loss.py
+++ b/tests/test_training/test_loss.py
@@ -113,6 +113,7 @@ def train(
         launcher = "slurm"
     else:
         launcher = "torch"
+    config.model.parallel_output = False
     initialize_distributed_env(config=config, launcher=launcher)
 
     assert hasattr(gpc, "config") and gpc.config is not None
@@ -158,7 +159,7 @@ def train(
     isp_communicator = initialize_parallel_communicator(model)
 
     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
+    criterion = FlashGPTLMLoss(parallel_output=gpc.config.model.parallel_output, label_smoothing=label_smoothing)
 
     # initialize the train data loader
     train_dl, dataset_types = build_train_loader_with_data_type()