Skip to content

Commit

Permalink
rdma: support NCCL multi-recv interface
Browse files Browse the repository at this point in the history
The multi-recv interface allows aggregating up to 8 receive requests in
a single request.

This commit does not yet advertise support for multi-recv to NCCL.

* Temporarily disables eager; it will be re-enabled in a future commit.

Signed-off-by: Eric Raut <[email protected]>
  • Loading branch information
rauteric committed Feb 25, 2024
1 parent 8c770ce commit 1278cf6
Show file tree
Hide file tree
Showing 2 changed files with 456 additions and 219 deletions.
33 changes: 30 additions & 3 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,32 @@ typedef struct nccl_net_ofi_rdma_mr_handle {
struct fid_mr *mr[];
} nccl_net_ofi_rdma_mr_handle_t;

/* Contents of ctrl message sent from receiver to sender to advertise
destination buffer */
typedef struct nccl_net_ofi_rdma_ctrl_msg {
/* One buffer descriptor carried inside a ctrl message. A ctrl message
 * (nccl_net_ofi_rdma_ctrl_msg_t) ends with an array of these entries.
 * NOTE(review): field order and types define the layout of the message
 * exchanged between peers -- do not reorder without updating both sides. */
typedef struct nccl_net_ofi_rdma_ctrl_msg_entry {
/* Tag associated with this entry -- presumably the NCCL tag used to match
 * a send against one receive of a multi-recv; TODO confirm */
int multi_recv_tag;
/* Address of the advertised buffer */
uint64_t buff_addr;
/* Length of the advertised buffer -- presumably in bytes; TODO confirm */
uint64_t buff_len;
/* Memory-registration keys for the buffer; the array has MAX_NUM_RAILS
 * slots -- presumably one key per rail; TODO confirm */
uint64_t buff_mr_key[MAX_NUM_RAILS];
} nccl_net_ofi_rdma_ctrl_msg_entry_t;

/* Contents of ctrl message sent from receiver to sender to advertise
 * destination buffer(s).
 *
 * The struct is variable-length: a fixed header followed by a flexible
 * array of per-buffer entries. sizeof(nccl_net_ofi_rdma_ctrl_msg_t) covers
 * the header only; storage for the entries must be added by whoever
 * allocates the message (see RDMA_CTRL_MSG_MAX_SIZE for the upper bound). */
typedef struct nccl_net_ofi_rdma_ctrl_msg {
/* Sequence number of the message (16-bit) */
uint16_t msg_seq_num;
/* Presumably the number of valid elements in entries[]; TODO confirm */
uint16_t multi_recv_size;
/* Flexible array member holding the per-buffer descriptors */
nccl_net_ofi_rdma_ctrl_msg_entry_t entries[];
} nccl_net_ofi_rdma_ctrl_msg_t;

/* Size in bytes of the largest possible entries[] array of a ctrl message,
 * i.e. NCCL_OFI_MAX_RECVS entries. */
#define RDMA_CTRL_MSG_ENTRIES_MAX_SIZE (NCCL_OFI_MAX_RECVS * sizeof(nccl_net_ofi_rdma_ctrl_msg_entry_t))
/* Size in bytes of the largest possible ctrl message: the fixed header
 * plus the maximum-size entries[] array. */
#define RDMA_CTRL_MSG_MAX_SIZE (sizeof(nccl_net_ofi_rdma_ctrl_msg_t) + RDMA_CTRL_MSG_ENTRIES_MAX_SIZE)

/* Structure used to store control messages in a free list.
 *
 * ctrl_msg must remain the last member: it ends with a flexible array
 * (entries[]), so the entries occupy the storage that follows this struct.
 * sizeof(nccl_net_ofi_rdma_ctrl_fl_item_t) therefore does not include any
 * entries; use RDMA_CTRL_FL_ITEM_MAX_SIZE when sizing free-list elements.
 *
 * NOTE(review): ISO C does not allow a struct containing a flexible array
 * member to be a member of another struct; this relies on a compiler
 * extension (accepted by GCC/Clang) -- confirm for all supported toolchains. */
typedef struct nccl_net_ofi_rdma_ctrl_fl_item {
/* Free-list registration info for this item */
nccl_ofi_freelist_reginfo_t fl_reginfo;
/* The ctrl message itself (variable-length, see above) */
nccl_net_ofi_rdma_ctrl_msg_t ctrl_msg;
} nccl_net_ofi_rdma_ctrl_fl_item_t;

/* Size in bytes of a free-list item able to hold a ctrl message with the
 * maximum number of entries (fixed part of the item plus the maximum-size
 * entries[] array). */
#define RDMA_CTRL_FL_ITEM_MAX_SIZE (sizeof(nccl_net_ofi_rdma_ctrl_fl_item_t) + RDMA_CTRL_MSG_ENTRIES_MAX_SIZE)

/* Required alignment, in bytes, of bounce buffers. For LL/LL128 protocols,
 * bounce buffers (source of RDMA read operations) need to be 128B aligned. */
#define BOUNCE_BUFFER_ALIGNMENT 128

Expand Down Expand Up @@ -152,6 +164,13 @@ typedef struct {
/* Total number of completions. Expect one completion for receiving the
* control message and one completion for each send segment. */
int total_num_compls;

/* Multi-recv information */
uint16_t multi_recv_size;
uint16_t multi_recv_start;
int multi_recv_tag;
/* This may not match sender-side seq num with multi-recv */
uint16_t recv_side_msg_seq_num;
} rdma_req_send_data_t;

/*
Expand All @@ -166,6 +185,8 @@ typedef struct {
nccl_net_ofi_schedule_t *ctrl_schedule;
/* Pointer to recv parent request */
nccl_net_ofi_rdma_req_t *recv_req;
/* Size of ctrl message */
size_t ctrl_msg_size;
} rdma_req_send_ctrl_data_t;

typedef struct {
Expand Down Expand Up @@ -206,6 +227,12 @@ typedef struct {
* For eager messages, the second completion will be received
* when the local read into the destination buffer is complete */
int total_num_compls;
/* Multi-recv information */
uint16_t multi_recv_size;
uint16_t multi_recv_start;
int multi_recv_tag;
/* Next req in sequence */
nccl_net_ofi_rdma_req_t *multi_recv_next;
} rdma_req_recv_data_t;

/*
Expand Down
Loading

0 comments on commit 1278cf6

Please sign in to comment.