Skip to content

Commit

Permalink
rdma: move idpool init to endpoint refcnt+ stage
Browse files (browse the repository at this point in the history)
The idpool is freed when the endpoint's refcnt drops to zero, so it must be
initialized again the next time the refcnt is incremented.

Signed-off-by: Eric Raut <[email protected]>
  • Loading branch information
rauteric committed Feb 21, 2024
1 parent f931085 commit 213172a
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions src/nccl_ofi_rdma.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -5570,22 +5570,6 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
/* Initialize number of rails */
ep->num_rails = num_rails;

/* Initialize endpoint ID pool */
ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
ret = ncclSystemError;
NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
goto unlock;
}

ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
if (OFI_UNLIKELY(ret != 0)) {
ret = ncclSystemError;
free(ep->comm_idpool);
ep->comm_idpool = NULL;
goto unlock;
}

/* Initialize reference count */
ep->ref_cnt = 0;

Expand Down Expand Up @@ -5625,6 +5609,22 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
goto unlock;
}

/* Initialize endpoint ID pool */
ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
ret = ncclSystemError;
NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
goto unlock;
}

ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
if (OFI_UNLIKELY(ret != 0)) {
ret = ncclSystemError;
free(ep->comm_idpool);
ep->comm_idpool = NULL;
goto unlock;
}

ret = init_rail_ofi_resources(device, ep);
if (ret != 0) {
goto unlock;
Expand Down

0 comments on commit 213172a

Please sign in to comment.