From 5fddb2e86a026f0a9ff085ce6830be0eca26360c Mon Sep 17 00:00:00 2001
From: Eric Raut
Date: Sat, 24 Feb 2024 01:15:06 +0000
Subject: [PATCH] Advertise multi-recv support to NCCL for RDMA protocol

RDMA protocol will now support up to 8 multi-recv buffers at a time.

Signed-off-by: Eric Raut
---
 include/nccl_ofi.h  | 3 ++-
 src/nccl_ofi_net.c  | 2 +-
 src/nccl_ofi_rdma.c | 3 +++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h
index 497deea02..7212f86cc 100644
--- a/include/nccl_ofi.h
+++ b/include/nccl_ofi.h
@@ -59,7 +59,8 @@ extern "C" {
 #define MIN_TAG_BITS_FOR_RING_ID (32 + 1)
 
 /* Maximum number of grouped receives */
-#define NCCL_OFI_MAX_RECVS 1
+#define NCCL_OFI_MAX_RECVS 8
+#define NCCL_OFI_MAX_RECVS_SENDRECV 1
 
 /*
  * This defines a higher value than maximum inflight requests supported by NCCL
diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c
index 9c6f57049..d4d7b6056 100644
--- a/src/nccl_ofi_net.c
+++ b/src/nccl_ofi_net.c
@@ -331,7 +331,7 @@ static int set_nic_props_default(int dev_id, struct fi_info *nic_prov,
 	 * impacted with this feature as NCCL doesn't aggregate receives from
 	 * same source.
 	 */
-	props->max_group_receives = NCCL_OFI_MAX_RECVS;
+	props->max_group_receives = NCCL_OFI_MAX_RECVS_SENDRECV;
 
 	if (support_gdr == GDR_SUPPORTED) {
 		props->hmem_support = true;
diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index aae931d62..337764b1e 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -597,6 +597,9 @@ static inline int get_properties(nccl_net_ofi_device_t *base_dev,
 	struct fi_info *info = device->device_rails[0].info;
 	int ret = nccl_net_ofi_info_properties(info, dev_id, base_dev->plugin->num_devs, props);
 
+	/* Multi-recv adjustment */
+	props->max_group_receives = NCCL_OFI_MAX_RECVS;
+
 	/* Scale speed by the total number of rails. Assume that all
 	 * reails have the same speed. */
 	if (ret == 0) {