diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml
new file mode 100644
index 00000000..c098ab08
--- /dev/null
+++ b/.azure-pipelines/integration-test-rocm.yml
@@ -0,0 +1,103 @@
+# ROCm integration test: builds the project with hipcc, builds rccl-tests, and runs
+# all_reduce_perf with libmscclpp_nccl.so preloaded (with and without execution files).
+trigger:
+- main
+
+pr:
+  branches:
+    include:
+    - main
+  drafts: false
+
+jobs:
+- job: IntegrationTestRocm
+  displayName: Integration test ROCm
+  strategy:
+    matrix:
+      rocm6.2:
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2
+
+  pool:
+    name: mscclpp-rocm
+  container:
+    image: $[ variables['containerImage'] ]
+    options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1
+
+  steps:
+  - task: Bash@3
+    name: Build
+    displayName: Build
+    inputs:
+      targetType: 'inline'
+      script: |
+        # Fail on the first error; otherwise only the last command's exit status decides the step result.
+        set -e
+        mkdir build && cd build
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON ..
+        make -j
+      workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+  - task: Bash@3
+    name: InstallRcclTest
+    displayName: Install rccl-test
+    inputs:
+      targetType: 'inline'
+      script: |
+        # Fail on the first error so a failed clone/cd cannot run make in the wrong directory.
+        set -e
+        git clone https://github.com/ROCm/rccl-tests.git
+        cd rccl-tests
+        make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j
+      workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+  - task: Bash@3
+    name: InstallDep
+    displayName: Install dependencies
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        git clone https://github.com/Azure/msccl-tools.git
+        cd msccl-tools
+        pip3 install .
+
+  - task: Bash@3
+    name: GenerateExectionFiles
+    displayName: Generate execution files
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/azure-mscclpp
+        cd azure-mscclpp
+        git checkout binyli/ci
+        mkdir execution-files
+        python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json
+        python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json
+
+  - task: Bash@3
+    name: AllReduceTest
+    displayName: Run mscclpp allReduce test
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        sudo /usr/local/mpi/bin/mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/apps/nccl/libmscclpp_nccl.so" \
+          -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
+      workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+  - task: Bash@3
+    name: AllReduceWithExecutionFileTest
+    displayName: Run mscclpp allReduce with execution file
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        sudo /usr/local/mpi/bin/mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \
+          -x ALLREDUCEPKT_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_packet.json \
+          -x ALLREDUCE_IP_JSON_FILE=./azure-mscclpp/execution-files/allreduce_mi300_sm_mscclpp.json \
+          -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \
+          -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100
+      workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/docker/base-x-rocm.dockerfile b/docker/base-x-rocm.dockerfile
new file mode 100644
index 00000000..5865cc39
--- /dev/null
+++ b/docker/base-x-rocm.dockerfile
@@ -0,0 +1,19 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"
+
+ENV DEBIAN_FRONTEND=noninteractive
+# Build RCCL ${RCCL_VERSION} for ${ARCH} into /opt/rocm; sources are removed in the same layer.
+ENV RCCL_VERSION=rocm-6.2.0
+ARG ARCH=gfx942
+ENV ARCH_TARGET=${ARCH}
+RUN cd /tmp && \
+    git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
+    cd rccl && \
+    ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \
+    cd .. && \
+    rm -rf /tmp/rccl
+
+WORKDIR /
diff --git a/docker/base-x.dockerfile b/docker/base-x.dockerfile
index 4be89c9d..a1ba2069 100644
--- a/docker/base-x.dockerfile
+++ b/docker/base-x.dockerfile
@@ -5,6 +5,7 @@ LABEL maintainer="MSCCL++"
 LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
 
 ENV DEBIAN_FRONTEND=noninteractive
+USER root
 
 RUN rm -rf /opt/nvidia
 
diff --git a/docker/build.sh b/docker/build.sh
index c906f903..d8af5f8f 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -8,6 +8,7 @@ baseImageTable=(
     ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
     ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
+    ["rocm6.2"]="rocm/rocm-terminal:6.2"
 )
 
 declare -A extraLdPathTable
@@ -16,13 +17,14 @@ extraLdPathTable=(
     ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
     ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
     ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
+    ["rocm6.2"]="/opt/rocm/lib"
 )
 
 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 TARGET=${1}
 
 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3]"
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|rocm6.2]"
 }
 
 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
@@ -36,12 +38,25 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 
 cd ${SCRIPT_DIR}/..
 
-docker build -t ${GHCR}:base-${TARGET} \
+docker build -t ${GHCR}-common:base-${TARGET} \
     -f docker/base-x.dockerfile \
     --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \
     --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
     --build-arg TARGET=${TARGET} .
 
+if [[ ${TARGET} == rocm* ]]; then  # ROCm targets: layer base-x-rocm.dockerfile on top of the common image
+    echo "Building ROCm base image..."
+    docker build -t ${GHCR}:base-${TARGET} \
+        -f docker/base-x-rocm.dockerfile \
+        --build-arg BASE_IMAGE=${GHCR}-common:base-${TARGET} \
+        --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
+        --build-arg TARGET=${TARGET} \
+        --build-arg ARCH="gfx942" .
+else  # all other targets: the common image is re-tagged unchanged as the base image
+    echo "Building CUDA base image..."
+    docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
+fi
+
 docker build -t ${GHCR}:base-dev-${TARGET} \
     -f docker/base-dev-x.dockerfile \
     --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt
new file mode 100644
index 00000000..e69de29b