Skip to content

Commit

Permalink
Align interfaces of put/get/putPackets/getPackets (#185)
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Oct 7, 2023
1 parent 497a9e0 commit 11ac824
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 81 deletions.
37 changes: 20 additions & 17 deletions .github/workflows/integration-test-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@ on: workflow_dispatch
jobs:
IntegrationTest:
runs-on: self-hosted
defaults:
run:
shell: bash
strategy:
matrix:
container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1]
cuda: [ cuda11.8, cuda12.1 ]

container:
image: ${{ matrix.container-image }}
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: Install CMake
run: |
Expand All @@ -39,35 +42,35 @@ jobs:
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
- name: Run mscclpp SendRecv test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
- name: Run mscclpp AllToAll test
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
- name: Check collective primitives performance
run: |
Expand Down
23 changes: 13 additions & 10 deletions .github/workflows/ut-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,21 @@ on: workflow_dispatch
jobs:
UnitTest:
runs-on: self-hosted
defaults:
run:
shell: bash
timeout-minutes: 30
strategy:
matrix:
container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1]
cuda: [ cuda11.8, cuda12.1 ]

container:
image: ${{ matrix.container-image }}
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: Build
run: |
Expand Down Expand Up @@ -44,20 +47,20 @@ jobs:
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
mpirun -tag-output -np 2 ./build/test/mp_unit_tests
mpirun -tag-output -np 4 ./build/test/mp_unit_tests
mpirun -tag-output -np 8 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
working-directory: ${{ github.workspace }}

- name: PyTests
run: |
set -e
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '${{ matrix.container-image }}' == *'cuda11'* ]]; then
pip3 install -r ../python/test/requirements_cu11.txt
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
python3 -m pip install -r ../python/test/requirements_cu11.txt
else
pip3 install -r ../python/test/requirements_cu12.txt
python3 -m pip install -r ../python/test/requirements_cu12.txt
fi
mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
working-directory: ${{ github.workspace }}
37 changes: 20 additions & 17 deletions include/mscclpp/packet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ union LLPacket {
/// @param flag The flag to read.
/// @param data The 8-byte data read.
/// @return True if the flag is not equal to the given flag.
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) {
__forceinline__ __device__ bool readOnce(uint32_t flag, uint2& data) const {
uint32_t flag1, flag2;
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
: "=r"(data.x), "=r"(flag1), "=r"(data.y), "=r"(flag2)
Expand All @@ -60,7 +60,7 @@ union LLPacket {
/// @param flag The flag to read.
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
/// @return The 8-byte data read.
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) {
__forceinline__ __device__ uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const {
uint2 data;
POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount);
return data;
Expand All @@ -75,28 +75,31 @@ union LLPacket {
};

#ifdef __CUDACC__
__forceinline__ __device__ void putPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
uint64_t srcBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the origin and write to the target buffer.
__forceinline__ __device__ void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
uint32_t* srcBase = (uint32_t*)((char*)src + srcOffset);
LLPacket* dstBase = (LLPacket*)((char*)dst + dstOffset);
size_t nElem = srcBytes / sizeof(uint64_t);
const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset);
LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset);
size_t nElem = originBytes / sizeof(uint64_t);
for (size_t i = threadId; i < nElem; i += numThreads) {
LLPacket* pkt = &dstBase[i];
pkt->write(srcBase[2 * i], srcBase[2 * i + 1], flag);
LLPacket* pkt = &targetBase[i];
pkt->write(originBase[2 * i], originBase[2 * i + 1], flag);
}
}

__forceinline__ __device__ void getPackets(void* dst, uint64_t dstOffset, void* src, uint64_t srcOffset,
uint64_t dstBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) {
/// Read from the target buffer and write to the origin.
__forceinline__ __device__ void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr,
uint64_t originOffset, uint64_t originBytes, uint32_t threadId,
uint32_t numThreads, uint32_t flag) {
// Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes
// TODO(saemal): this is not matching sm_channel get method.
LLPacket* srcBase = (LLPacket*)((char*)src + srcOffset);
uint2* dstBase = (uint2*)((char*)dst + dstOffset);
size_t nElem = dstBytes / sizeof(uint2);
const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset);
uint2* originBase = (uint2*)((char*)originPtr + originOffset);
size_t nElem = originBytes / sizeof(uint2);
for (size_t i = threadId; i < nElem; i += numThreads) {
LLPacket* pkt = &srcBase[i];
dstBase[i] = pkt->read(flag);
const LLPacket* pkt = &targetBase[i];
originBase[i] = pkt->read(flag);
}
}
#endif // __CUDACC__
Expand Down
Loading

0 comments on commit 11ac824

Please sign in to comment.