From e8e632b4e4ddb3b910ce017ce28e855164d687b5 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 24 Jun 2026 14:55:50 +0300 Subject: [PATCH 01/33] libibverbs: Set vmr->access in ibv_cmd_reg_mr_ex() ibv_cmd_reg_mr_ex() did not record the MR's access flags in vmr->access, unlike ibv_cmd_reg_mr(). ibv_dereg_mr() decides whether to re-enable fork tracking from verbs_get_mr(mr)->access: if (... && !(access & IBV_ACCESS_ON_DEMAND)) ibv_dofork_range(addr, length); With vmr->access left zero, an on-demand (ODP) MR registered through ibv_reg_mr_ex() / ibv_cmd_reg_mr_ex() reads access == 0 at dereg, so ibv_dofork_range() runs even though registration skipped the matching ibv_dontfork_range() for ODP (need_fork = !(ON_DEMAND || FD)). The unbalanced dofork corrupts the fork-range accounting when ibv_fork_init() is enabled. Set vmr->access = mr_init_attr->access on the success path, mirroring ibv_cmd_reg_mr(), so the dereg fork decision is correct. (The write-ABI fallback path already goes through ibv_cmd_reg_mr(), which sets it.) Fixes: ca61708a8838 ("verbs: Add ibv_cmd_reg_mr_ex() to be used by drivers") Signed-off-by: Yishai Hadas Signed-off-by: Jiri Pirko Co-Authored-By: Claude Opus 4.8 --- libibverbs/cmd_mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libibverbs/cmd_mr.c b/libibverbs/cmd_mr.c index bf0953253..f45ce3992 100644 --- a/libibverbs/cmd_mr.c +++ b/libibverbs/cmd_mr.c @@ -235,6 +235,7 @@ int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, vmr->mr_type = IBV_MR_TYPE_DMABUF_MR; else vmr->mr_type = IBV_MR_TYPE_MR; + vmr->access = mr_init_attr->access; return 0; } From 752a9b2f116c33e02703ff254d584049de375359 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 4 May 2026 14:54:53 +0200 Subject: [PATCH 02/33] Update kernel headers To commit: d7a40b519497 ("RDMA/uverbs: Expose CoCo DMA bounce requirement to userspace"). 
Signed-off-by: Jiri Pirko --- kernel-headers/rdma/ib_user_ioctl_cmds.h | 4 ++++ kernel-headers/rdma/ib_user_ioctl_verbs.h | 27 ++++++++++++++++++++++ kernel-headers/rdma/ib_user_verbs.h | 2 ++ kernel-headers/rdma/mlx5_user_ioctl_cmds.h | 5 ++++ 4 files changed, 38 insertions(+) diff --git a/kernel-headers/rdma/ib_user_ioctl_cmds.h b/kernel-headers/rdma/ib_user_ioctl_cmds.h index 72041c1b0..839835bd4 100644 --- a/kernel-headers/rdma/ib_user_ioctl_cmds.h +++ b/kernel-headers/rdma/ib_user_ioctl_cmds.h @@ -117,6 +117,7 @@ enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, UVERBS_ATTR_CREATE_CQ_BUFFER_FD, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, + UVERBS_ATTR_CREATE_CQ_BUF_UMEM, }; enum uverbs_attrs_destroy_cq_cmd_attr_ids { @@ -158,6 +159,9 @@ enum uverbs_attrs_create_qp_cmd_attr_ids { UVERBS_ATTR_CREATE_QP_EVENT_FD, UVERBS_ATTR_CREATE_QP_RESP_CAP, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_CREATE_QP_BUF_UMEM, + UVERBS_ATTR_CREATE_QP_RQ_BUF_UMEM, + UVERBS_ATTR_CREATE_QP_SQ_BUF_UMEM, }; enum uverbs_attrs_destroy_qp_cmd_attr_ids { diff --git a/kernel-headers/rdma/ib_user_ioctl_verbs.h b/kernel-headers/rdma/ib_user_ioctl_verbs.h index 90c5cd8e7..51030c27d 100644 --- a/kernel-headers/rdma/ib_user_ioctl_verbs.h +++ b/kernel-headers/rdma/ib_user_ioctl_verbs.h @@ -273,4 +273,31 @@ struct ib_uverbs_gid_entry { __u32 netdev_ifindex; /* It is 0 if there is no netdev associated with it */ }; +enum ib_uverbs_buffer_type { + IB_UVERBS_BUFFER_TYPE_DMABUF, + IB_UVERBS_BUFFER_TYPE_VA, +}; + +/* + * Describes a single buffer backed by dma-buf or user virtual address. + * Used as the payload of a per-attribute UVERBS_ATTR_UMEM-typed attribute. + * + * @type: buffer type from enum ib_uverbs_buffer_type + * @fd: dma-buf file descriptor (valid for IB_UVERBS_BUFFER_TYPE_DMABUF) + * @flags: required flags; the kernel rejects the call with -EINVAL if any + * bit is not understood. No bits are defined yet. 
+ * @optional_flags: advisory flags; bits the kernel does not understand are + * silently ignored. No bits are defined yet. + * @addr: offset within dma-buf, or user virtual address for VA + * @length: buffer length in bytes + */ +struct ib_uverbs_buffer_desc { + __u32 type; + __s32 fd; + __u32 flags; + __u32 optional_flags; + __aligned_u64 addr; + __aligned_u64 length; +}; + #endif diff --git a/kernel-headers/rdma/ib_user_verbs.h b/kernel-headers/rdma/ib_user_verbs.h index 3b7bd9981..d2aeadb6d 100644 --- a/kernel-headers/rdma/ib_user_verbs.h +++ b/kernel-headers/rdma/ib_user_verbs.h @@ -1368,6 +1368,8 @@ enum ib_uverbs_device_cap_flags { IB_UVERBS_DEVICE_FLUSH_PERSISTENT = 1ULL << 39, /* Atomic write attributes */ IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, + /* CoCo guest with DMA bounce buffering required */ + IB_UVERBS_DEVICE_CC_DMA_BOUNCE = 1ULL << 41, }; enum ib_uverbs_raw_packet_caps { diff --git a/kernel-headers/rdma/mlx5_user_ioctl_cmds.h b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h index 01a2a050e..ddb898afd 100644 --- a/kernel-headers/rdma/mlx5_user_ioctl_cmds.h +++ b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h @@ -274,6 +274,11 @@ enum mlx5_ib_device_query_context_attrs { enum mlx5_ib_create_cq_attrs { MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX = UVERBS_ID_DRIVER_NS_WITH_UHW, + MLX5_IB_ATTR_CREATE_CQ_DBR_BUF_UMEM, +}; + +enum mlx5_ib_create_qp_attrs { + MLX5_IB_ATTR_CREATE_QP_DBR_BUF_UMEM = UVERBS_ID_DRIVER_NS_WITH_UHW, }; enum mlx5_ib_reg_dmabuf_mr_attrs { From 21ffe5bf21979e3322da563ee1b04e8616dc3bb6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 7 Apr 2026 14:26:54 +0200 Subject: [PATCH 03/33] libibverbs: Introduce struct ibv_buf for common buffer description Add struct ibv_buf with addr and size fields to provide a common abstraction for buffer metadata that providers can embed in their internal buffer structures. Introduce an init helper alongside. 
Signed-off-by: Jiri Pirko --- libibverbs/driver.h | 15 +++++++++++++++ libibverbs/verbs.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/libibverbs/driver.h b/libibverbs/driver.h index e6187729d..45a0b1d29 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -49,6 +49,21 @@ struct verbs_device; +/* Must change the PRIVATE IBVERBS_PRIVATE_ symbol if this is changed */ +struct ibv_buf { + void *addr; + size_t size; + struct ibv_pd *pd; +}; + +static inline void ibv_buf_init(struct ibv_buf *buf, struct ibv_pd *pd, + void *addr, size_t size) +{ + buf->pd = pd; + buf->addr = addr; + buf->size = size; +} + enum { VERBS_LOG_LEVEL_NONE, VERBS_LOG_ERR, diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 36d120eec..63a6faad3 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -636,6 +636,8 @@ struct ibv_dmah { struct ibv_context *context; }; +struct ibv_buf; + struct ibv_pd { struct ibv_context *context; uint32_t handle; From 60a66816002b4e0db6e0deccb1da9af262a1f774 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 7 Apr 2026 14:22:18 +0200 Subject: [PATCH 04/33] libibverbs: Add dmabuf fd field to struct ibv_buf Add an int dmabuf_fd field to struct ibv_buf so that providers can store the DMA-buf file descriptor directly in the common buffer abstraction. 
Signed-off-by: Jiri Pirko --- libibverbs/driver.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libibverbs/driver.h b/libibverbs/driver.h index 45a0b1d29..4d73194d5 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -54,6 +54,7 @@ struct ibv_buf { void *addr; size_t size; struct ibv_pd *pd; + int dmabuf_fd; }; static inline void ibv_buf_init(struct ibv_buf *buf, struct ibv_pd *pd, @@ -62,6 +63,14 @@ static inline void ibv_buf_init(struct ibv_buf *buf, struct ibv_pd *pd, buf->pd = pd; buf->addr = addr; buf->size = size; + buf->dmabuf_fd = -1; +} + +static inline void ibv_buf_init_dmabuf(struct ibv_buf *buf, struct ibv_pd *pd, + void *addr, size_t size, int fd) +{ + ibv_buf_init(buf, pd, addr, size); + buf->dmabuf_fd = fd; } enum { From 0dd1543fc09b4de8b12cb183c313f8e484164a5b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 10:16:09 +0100 Subject: [PATCH 05/33] libibverbs: Add ibv_alloc_buf/ibv_free_buf verbs API Add alloc_buf/free_buf provider ops and corresponding helpers to allow applications to allocate buffers using the provider's configured allocation method. 
Signed-off-by: Jiri Pirko --- CMakeLists.txt | 2 +- debian/control | 2 +- debian/libibverbs1.symbols | 5 +++- libibverbs/CMakeLists.txt | 2 +- libibverbs/driver.h | 3 ++ libibverbs/dummy_ops.c | 36 +++++++++++++++++++++++ libibverbs/libibverbs.map.in | 6 ++++ libibverbs/man/CMakeLists.txt | 2 ++ libibverbs/man/ibv_alloc_buf.3.md | 48 +++++++++++++++++++++++++++++++ libibverbs/verbs.c | 10 +++++++ libibverbs/verbs.h | 20 +++++++++++++ 11 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 libibverbs/man/ibv_alloc_buf.3.md diff --git a/CMakeLists.txt b/CMakeLists.txt index fac3a4354..140ea8c5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,7 +88,7 @@ set(PACKAGE_VERSION "64.0") # When this is changed the values in these files need changing too: # debian/control # debian/libibverbs1.symbols -set(IBVERBS_PABI_VERSION "59") +set(IBVERBS_PABI_VERSION "64") set(IBVERBS_PROVIDER_SUFFIX "-rdmav${IBVERBS_PABI_VERSION}.so") #------------------------- diff --git a/debian/control b/debian/control index 1109aa772..b7a731abe 100644 --- a/debian/control +++ b/debian/control @@ -151,7 +151,7 @@ Section: libs Pre-Depends: ${misc:Pre-Depends} Depends: adduser, ${misc:Depends}, ${shlibs:Depends} Recommends: ibverbs-providers -Breaks: ibverbs-providers (<< 59~) +Breaks: ibverbs-providers (<< 64~) Description: Library for direct userspace use of RDMA (InfiniBand/iWARP) libibverbs is a library that allows userspace processes to use RDMA "verbs" as described in the InfiniBand Architecture Specification and diff --git a/debian/libibverbs1.symbols b/debian/libibverbs1.symbols index cbfe19624..509383b65 100644 --- a/debian/libibverbs1.symbols +++ b/debian/libibverbs1.symbols @@ -14,13 +14,15 @@ libibverbs.so.1 libibverbs1 #MINVER# IBVERBS_1.14@IBVERBS_1.14 36 IBVERBS_1.15@IBVERBS_1.15 59 IBVERBS_1.16@IBVERBS_1.16 62 - (symver)IBVERBS_PRIVATE_59 59 + IBVERBS_1.17@IBVERBS_1.17 64 + (symver)IBVERBS_PRIVATE_64 64 _ibv_query_gid_ex@IBVERBS_1.11 32 
_ibv_query_gid_table@IBVERBS_1.11 32 ibv_ack_async_event@IBVERBS_1.0 1.1.6 ibv_ack_async_event@IBVERBS_1.1 1.1.6 ibv_ack_cq_events@IBVERBS_1.0 1.1.6 ibv_ack_cq_events@IBVERBS_1.1 1.1.6 + ibv_alloc_buf@IBVERBS_1.17 64 ibv_alloc_dmah@IBVERBS_1.15 59 ibv_alloc_pd@IBVERBS_1.0 1.1.6 ibv_alloc_pd@IBVERBS_1.1 1.1.6 @@ -63,6 +65,7 @@ libibverbs.so.1 libibverbs1 #MINVER# ibv_dontfork_range@IBVERBS_1.1 1.1.6 ibv_event_type_str@IBVERBS_1.1 1.1.6 ibv_fork_init@IBVERBS_1.1 1.1.6 + ibv_free_buf@IBVERBS_1.17 64 ibv_free_device_list@IBVERBS_1.0 1.1.6 ibv_free_device_list@IBVERBS_1.1 1.1.6 ibv_get_async_event@IBVERBS_1.0 1.1.6 diff --git a/libibverbs/CMakeLists.txt b/libibverbs/CMakeLists.txt index d428bdbdf..f37a3dddb 100644 --- a/libibverbs/CMakeLists.txt +++ b/libibverbs/CMakeLists.txt @@ -21,7 +21,7 @@ configure_file("libibverbs.map.in" rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" # See Documentation/versioning.md - 1 1.16.${PACKAGE_VERSION} + 1 1.17.${PACKAGE_VERSION} all_providers.c cmd.c cmd_ah.c diff --git a/libibverbs/driver.h b/libibverbs/driver.h index 4d73194d5..e7f420be8 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -364,6 +364,8 @@ struct verbs_context_ops { uint32_t flags, struct ibv_sge *sg_list, uint32_t num_sges); + void *(*alloc_buf)(struct ibv_pd *pd, size_t size, + struct ibv_buf **buf); struct ibv_dm *(*alloc_dm)(struct ibv_context *context, struct ibv_alloc_dm_attr *attr); struct ibv_dmah *(*alloc_dmah)(struct ibv_context *context, @@ -432,6 +434,7 @@ struct verbs_context_ops { int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); int (*dm_export_dmabuf_fd)(struct ibv_dm *dm); + void (*free_buf)(struct ibv_buf *buf); void (*free_context)(struct ibv_context *context); int (*free_dm)(struct ibv_dm *dm); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c index e81159374..67aae0c1e 100644 --- a/libibverbs/dummy_ops.c +++ 
b/libibverbs/dummy_ops.c @@ -30,6 +30,8 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include +#include #include #include "ibverbs.h" #include @@ -63,6 +65,36 @@ static struct ibv_mw *alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) return NULL; } +static void *alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf) +{ + struct ibv_buf *ibuf; + void *ptr; + int ret; + + ibuf = calloc(1, sizeof(*ibuf)); + if (!ibuf) { + errno = ENOMEM; + return NULL; + } + + ret = posix_memalign(&ptr, sysconf(_SC_PAGESIZE), size); + if (ret) { + free(ibuf); + errno = ret; + return NULL; + } + + ibv_buf_init(ibuf, pd, ptr, size); + *buf = ibuf; + return ptr; +} + +static void free_buf(struct ibv_buf *buf) +{ + free(buf->addr); + free(buf); +} + static struct ibv_mr *alloc_null_mr(struct ibv_pd *pd) { errno = EOPNOTSUPP; @@ -535,6 +567,7 @@ static void unimport_pd(struct ibv_pd *pd) */ const struct verbs_context_ops verbs_dummy_ops = { advise_mr, + alloc_buf, alloc_dm, alloc_dmah, alloc_mw, @@ -576,6 +609,7 @@ const struct verbs_context_ops verbs_dummy_ops = { destroy_wq, detach_mcast, dm_export_dmabuf_fd, + free_buf, free_context, free_dm, get_srq_num, @@ -665,6 +699,7 @@ void verbs_set_ops(struct verbs_context *vctx, } while (0) SET_OP(vctx, advise_mr); + SET_OP(vctx, alloc_buf); SET_OP(vctx, alloc_dm); SET_OP(vctx, alloc_dmah); SET_OP(ctx, alloc_mw); @@ -706,6 +741,7 @@ void verbs_set_ops(struct verbs_context *vctx, SET_OP(vctx, destroy_wq); SET_PRIV_OP(ctx, detach_mcast); SET_OP(vctx, dm_export_dmabuf_fd); + SET_OP(vctx, free_buf); SET_PRIV_OP_IC(ctx, free_context); SET_OP(vctx, free_dm); SET_OP(vctx, get_srq_num); diff --git a/libibverbs/libibverbs.map.in b/libibverbs/libibverbs.map.in index d3d4d41b6..d82112455 100644 --- a/libibverbs/libibverbs.map.in +++ b/libibverbs/libibverbs.map.in @@ -177,6 +177,12 @@ IBVERBS_1.16 { ibv_query_port_speed; } IBVERBS_1.15; +IBVERBS_1.17 { + global: + ibv_alloc_buf; + ibv_free_buf; +} 
IBVERBS_1.16; + /* If any symbols in this stanza change ABI then the entire staza gets a new symbol version. See the top level CMakeLists.txt for this setting. */ diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt index f498c1532..1711807d9 100644 --- a/libibverbs/man/CMakeLists.txt +++ b/libibverbs/man/CMakeLists.txt @@ -1,5 +1,6 @@ rdma_man_pages( ibv_advise_mr.3.md + ibv_alloc_buf.3.md ibv_alloc_dm.3 ibv_alloc_dmah.3.md ibv_alloc_mw.3 @@ -86,6 +87,7 @@ rdma_man_pages( ibv_xsrq_pingpong.1 ) rdma_alias_man_pages( + ibv_alloc_buf.3 ibv_free_buf.3 ibv_alloc_dm.3 ibv_free_dm.3 ibv_alloc_dm.3 ibv_reg_dm_mr.3 ibv_alloc_dm.3 ibv_memcpy_to_dm.3 diff --git a/libibverbs/man/ibv_alloc_buf.3.md b/libibverbs/man/ibv_alloc_buf.3.md new file mode 100644 index 000000000..310e9affc --- /dev/null +++ b/libibverbs/man/ibv_alloc_buf.3.md @@ -0,0 +1,48 @@ +--- +date: 2026-05-29 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_alloc_buf +--- + +# NAME + +ibv_alloc_buf, ibv_free_buf - allocate and free provider-aware buffers + +# SYNOPSIS + +```c +#include + +void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); + +void ibv_free_buf(struct ibv_buf *buf); +``` + +# DESCRIPTION + +**ibv_alloc_buf()** allocates a buffer using the allocation method selected by +the provider for the protection domain *pd*. On success it returns the mapped +address and stores an opaque buffer handle in *buf*. The handle is used by +**ibv_free_buf()** and must not be interpreted by applications. + +# ARGUMENTS + +*pd* +: The protection domain (or parent domain) to allocate from; its provider selects the buffer's backing allocation method. It must remain valid until the buffer is freed with **ibv_free_buf()**. + +*size* +: Size of the buffer to allocate, in bytes. 
+ +*buf* +: For **ibv_alloc_buf()**, an output parameter set on success to an opaque buffer handle. For **ibv_free_buf()**, the buffer handle returned by **ibv_alloc_buf()** that is to be released. + +# RETURN VALUE + +**ibv_alloc_buf()** returns the mapped buffer address on success, or NULL if the +request fails. + +**ibv_free_buf()** does not return a value. diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index c65176ccf..371431c4c 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -435,6 +435,16 @@ struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, return mr; } +void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf) +{ + return get_ops(pd->context)->alloc_buf(pd, size, buf); +} + +void ibv_free_buf(struct ibv_buf *buf) +{ + get_ops(buf->pd->context)->free_buf(buf); +} + /* Note: mr_init_attr may be modified during this call */ struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr) { diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 63a6faad3..0bbb802f9 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -2185,6 +2185,9 @@ struct ibv_values_ex { struct verbs_context { /* "grows up" - new fields go here */ + void (*free_buf)(struct ibv_buf *buf); + void *(*alloc_buf)(struct ibv_pd *pd, size_t size, + struct ibv_buf **buf); int (*dm_export_dmabuf_fd)(struct ibv_dm *dm); struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr); @@ -3197,6 +3200,23 @@ ibv_alloc_parent_domain(struct ibv_context *context, return vctx->alloc_parent_domain(context, attr); } +/** + * ibv_alloc_buf - Allocate a buffer using provider-configured allocation method + * @pd: Protection domain or parent domain + * @size: Buffer size in bytes + * @buf: On success, set to a handle for ibv_free_buf() + * + * Returns the usable buffer address on success, or NULL on failure with + * errno set. 
+ */ +void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); + +/** + * ibv_free_buf - Free a buffer allocated with ibv_alloc_buf + * @buf: Handle from ibv_alloc_buf() + */ +void ibv_free_buf(struct ibv_buf *buf); + /** * ibv_alloc_dmah - Allocate a dma handle */ From ac570df397e52e68399e85075933d446196cc83c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 16 Jun 2026 11:20:06 +0200 Subject: [PATCH 06/33] libibverbs: Fall back to legacy ops in default reg_mr_ex The default (dummy) reg_mr_ex op returns EOPNOTSUPP, so ibv_reg_mr_ex() fails on any provider that does not implement its own reg_mr_ex, even when the provider supports the equivalent legacy registration ops. Translate the ibv_mr_init_attr request into the provider's existing reg_mr() or reg_dmabuf_mr() op, selecting the dmabuf-based op when the request carries an fd and the address-based op otherwise. Signed-off-by: Jiri Pirko --- libibverbs/dummy_ops.c | 45 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c index 67aae0c1e..a27e22905 100644 --- a/libibverbs/dummy_ops.c +++ b/libibverbs/dummy_ops.c @@ -508,8 +508,49 @@ static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length, static struct ibv_mr *reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr) { - errno = EOPNOTSUPP; - return NULL; + const struct verbs_context_ops *ops = get_ops(pd->context); + uint64_t comp_mask = mr_init_attr->comp_mask; + uint64_t supported_mask = IBV_REG_MR_MASK_IOVA | + IBV_REG_MR_MASK_ADDR | + IBV_REG_MR_MASK_FD | + IBV_REG_MR_MASK_FD_OFFSET; + + /* + * The provider has no reg_mr_ex op, so translate the request into one + * of the legacy registration ops. Fork handling and the common mr + * fields are taken care of by ibv_reg_mr_ex(). Anything the legacy ops + * cannot express (e.g. a dma handle) is rejected. 
+ */ + if (!check_comp_mask(comp_mask, supported_mask)) { + errno = EOPNOTSUPP; + return NULL; + } + + if (comp_mask & IBV_REG_MR_MASK_FD) { + if ((comp_mask & IBV_REG_MR_MASK_ADDR) || + !(comp_mask & IBV_REG_MR_MASK_FD_OFFSET) || + !(comp_mask & IBV_REG_MR_MASK_IOVA)) { + errno = EINVAL; + return NULL; + } + + return ops->reg_dmabuf_mr(pd, mr_init_attr->fd_offset, + mr_init_attr->length, + mr_init_attr->iova, mr_init_attr->fd, + mr_init_attr->access); + } + + if (!(comp_mask & IBV_REG_MR_MASK_ADDR) || + (comp_mask & IBV_REG_MR_MASK_FD_OFFSET)) { + errno = EINVAL; + return NULL; + } + + return ops->reg_mr(pd, mr_init_attr->addr, mr_init_attr->length, + (comp_mask & IBV_REG_MR_MASK_IOVA) ? + mr_init_attr->iova : + (uintptr_t)mr_init_attr->addr, + mr_init_attr->access); } static struct ibv_mr *reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, From 29bcea3f3fc9ca6eacff6887a76ff064500d8a96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 11 Jun 2026 17:40:11 +0200 Subject: [PATCH 07/33] libibverbs: Accept ibv_buf in ibv_reg_mr_ex() Extend struct ibv_mr_init_attr with a buf field and the IBV_REG_MR_MASK_BUF comp_mask bit so a handle obtained from ibv_alloc_buf() can be registered directly through ibv_reg_mr_ex(). When the bit is set, translate the ibv_buf fields into the concrete init attr fields before registration, so the rest of the flow stays unchanged: a dma-buf backed buffer is mapped to the fd-based path (fd/fd_offset/iova) while a plain memory buffer is mapped to the addr-based path. Consume the IBV_REG_MR_MASK_BUF bit during this translation, leaving only the masks the lower layers already understand. 
Signed-off-by: Jiri Pirko --- libibverbs/man/ibv_reg_mr.3 | 40 +++++++++++- libibverbs/verbs.c | 120 +++++++++++++++++++++++++++++++----- libibverbs/verbs.h | 2 + 3 files changed, 145 insertions(+), 17 deletions(-) diff --git a/libibverbs/man/ibv_reg_mr.3 b/libibverbs/man/ibv_reg_mr.3 index 9fcb69a09..bd4aeb380 100644 --- a/libibverbs/man/ibv_reg_mr.3 +++ b/libibverbs/man/ibv_reg_mr.3 @@ -129,6 +129,22 @@ field to let application mark which fields are applicable. In addition, it includes the .I mr_init_attr->dmah which can be used to include an ibv_dmah object that will be used for that MR. +It also includes the +.I mr_init_attr->buf +field, selected by +.B IBV_REG_MR_MASK_BUF\fR, +which can be used to register a buffer handle returned by +.BR ibv_alloc_buf (3). +When +.B IBV_REG_MR_MASK_BUF +is set, +.I mr_init_attr->buf +must point to the buffer handle and +.I mr_init_attr->addr +and +.I mr_init_attr->length +describe the buffer range to register. +The protection domain must be the same protection domain used to allocate the buffer, and the range must be within the allocated buffer. The other fields on the input pointer have the same meaning as of the fields that described in that man page for the other verbs. .PP .B ibv_dereg_mr() @@ -152,10 +168,32 @@ returns 0 on success, or the value of errno on failure (which indicates the fail .SH "NOTES" .B ibv_dereg_mr() fails if any memory window is still bound to this MR. -.B ibv_dereg_mr_ex() +.PP +.B ibv_reg_mr_ex() One among mr_init_attr->fd and mr_init_attr->addr is required, both can't come together. 
+When +.B IBV_REG_MR_MASK_BUF +is set, +.I mr_init_attr->buf +identifies the buffer handle returned by +.BR ibv_alloc_buf (3), +and +.I mr_init_attr->addr +is required and +.B IBV_REG_MR_MASK_ADDR +must be set; the address selects the registered range within +.I mr_init_attr->buf\fR. +.PP +For a dma-buf backed buffer, +the caller must not set +.I mr_init_attr->fd\fR, +.I mr_init_attr->fd_offset\fR, +or +.I mr_init_attr->iova +because libibverbs derives them from the buffer handle. .SH "SEE ALSO" .BR ibv_alloc_pd (3), +.BR ibv_alloc_buf (3), .BR ibv_post_send (3), .BR ibv_post_recv (3), .BR ibv_post_srq_recv (3) diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index 371431c4c..8e5244d7e 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -445,38 +445,126 @@ void ibv_free_buf(struct ibv_buf *buf) get_ops(buf->pd->context)->free_buf(buf); } -/* Note: mr_init_attr may be modified during this call */ +/* + * Translate an ibv_buf handle into the concrete mr_init_attr fields so the + * registration can proceed through the regular addr-based or fd-based path. + * The IBV_REG_MR_MASK_BUF bit is consumed here and replaced by the masks the + * lower layers understand. + */ +static int fill_mr_init_attr_from_buf(struct ibv_pd *pd, + struct ibv_mr_init_attr *mr_init_attr) +{ + struct ibv_buf *buf = mr_init_attr->buf; + uintptr_t base = (uintptr_t)buf->addr; + uintptr_t addr = (uintptr_t)mr_init_attr->addr; + size_t off; + + if (pd != buf->pd) { + errno = EINVAL; + return -1; + } + + if (!(mr_init_attr->comp_mask & IBV_REG_MR_MASK_ADDR)) { + errno = EINVAL; + return -1; + } + + /* The registered range must lie within the allocated buffer. 
*/ + if (addr < base) { + errno = EINVAL; + return -1; + } + + off = addr - base; + if (off > buf->size || + mr_init_attr->length > buf->size - off) { + errno = EINVAL; + return -1; + } + + if (mr_init_attr->comp_mask & (IBV_REG_MR_MASK_FD | + IBV_REG_MR_MASK_FD_OFFSET)) { + errno = EINVAL; + return -1; + } + + mr_init_attr->comp_mask &= ~IBV_REG_MR_MASK_BUF; + + if (buf->dmabuf_fd != -1) { + /* dma-buf backed buffer: register through the fd-based path. */ + if (mr_init_attr->comp_mask & IBV_REG_MR_MASK_IOVA) { + errno = EINVAL; + return -1; + } + + mr_init_attr->comp_mask &= ~IBV_REG_MR_MASK_ADDR; + mr_init_attr->comp_mask |= IBV_REG_MR_MASK_FD | + IBV_REG_MR_MASK_FD_OFFSET | + IBV_REG_MR_MASK_IOVA; + mr_init_attr->fd = buf->dmabuf_fd; + mr_init_attr->fd_offset = off; + mr_init_attr->iova = addr; + } + + return 0; +} + +/* + * The caller's mr_init_attr is read-only input and is never modified: a local + * copy is built and passed to the lower layers which may modify it further. + */ struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr) { struct verbs_device *device = verbs_get_device(pd->context->device); + uint64_t comp_mask = mr_init_attr->comp_mask; + struct ibv_mr_init_attr attr = { + .length = mr_init_attr->length, + .access = mr_init_attr->access, + .comp_mask = comp_mask, + }; struct ibv_mr *mr; - int in_access = mr_init_attr->access; - bool need_fork = !((mr_init_attr->access & IBV_ACCESS_ON_DEMAND) || - (mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD)); + bool need_fork; + + if (comp_mask & IBV_REG_MR_MASK_IOVA) + attr.iova = mr_init_attr->iova; + if (comp_mask & IBV_REG_MR_MASK_ADDR) + attr.addr = mr_init_attr->addr; + if (comp_mask & IBV_REG_MR_MASK_FD) + attr.fd = mr_init_attr->fd; + if (comp_mask & IBV_REG_MR_MASK_FD_OFFSET) + attr.fd_offset = mr_init_attr->fd_offset; + if (comp_mask & IBV_REG_MR_MASK_DMAH) + attr.dmah = mr_init_attr->dmah; + if (comp_mask & IBV_REG_MR_MASK_BUF) + attr.buf = mr_init_attr->buf; + + if 
(attr.comp_mask & IBV_REG_MR_MASK_BUF && + fill_mr_init_attr_from_buf(pd, &attr)) + return NULL; - if (need_fork && ibv_dontfork_range(mr_init_attr->addr, mr_init_attr->length)) + need_fork = !((attr.access & IBV_ACCESS_ON_DEMAND) || + (attr.comp_mask & IBV_REG_MR_MASK_FD)); + + if (need_fork && ibv_dontfork_range(attr.addr, attr.length)) return NULL; if (!(device->core_support & IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS)) - mr_init_attr->access &= ~IBV_ACCESS_OPTIONAL_RANGE; + attr.access &= ~IBV_ACCESS_OPTIONAL_RANGE; - mr = get_ops(pd->context)->reg_mr_ex(pd, mr_init_attr); + mr = get_ops(pd->context)->reg_mr_ex(pd, &attr); if (mr) { mr->context = pd->context; - mr->length = mr_init_attr->length; + mr->length = attr.length; mr->pd = pd; - if (mr_init_attr->comp_mask & IBV_REG_MR_MASK_ADDR) - mr->addr = mr_init_attr->addr; + if (attr.comp_mask & IBV_REG_MR_MASK_ADDR) + mr->addr = attr.addr; else /* Follows ibv_reg_dmabuf_mr logic */ - mr->addr = (void *)(uintptr_t) mr_init_attr->fd_offset; - } else { - if (need_fork) - ibv_dofork_range(mr_init_attr->addr, mr_init_attr->length); + mr->addr = (void *)(uintptr_t) attr.fd_offset; + } else if (need_fork) { + ibv_dofork_range(attr.addr, attr.length); } - /* restore the input access flags */ - mr_init_attr->access = in_access; return mr; } diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 0bbb802f9..9a90f2128 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -690,6 +690,7 @@ enum ibv_mr_init_attr_mask { IBV_REG_MR_MASK_FD = 1 << 2, IBV_REG_MR_MASK_FD_OFFSET = 1 << 3, IBV_REG_MR_MASK_DMAH = 1 << 4, + IBV_REG_MR_MASK_BUF = 1 << 5, }; struct ibv_mr_init_attr { @@ -701,6 +702,7 @@ struct ibv_mr_init_attr { int fd; uint64_t fd_offset; struct ibv_dmah *dmah; + struct ibv_buf *buf; /* Handle from ibv_alloc_buf(), addr must be set */ }; enum ibv_mw_type { From 3f56298c0120b1d3c8d924e4a8586a86a1f79577 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 3 Mar 2026 15:53:19 +0100 Subject: [PATCH 08/33] 
libibverbs: Add ibv_reg_buf_mr() for provider-aware buffers Add ibv_reg_buf_mr() to register memory returned by ibv_alloc_buf(). Implement it as a thin wrapper: set IBV_REG_MR_MASK_BUF and forward the buffer handle to ibv_reg_mr_ex(), which resolves it to the appropriate addr-based or DMA-buf-based registration and validates the range against the stored buffer metadata. Keep it equivalent to calling ibv_reg_mr_ex() with IBV_REG_MR_MASK_BUF. Signed-off-by: Jiri Pirko --- debian/libibverbs1.symbols | 1 + libibverbs/libibverbs.map.in | 1 + libibverbs/man/CMakeLists.txt | 1 + libibverbs/man/ibv_alloc_buf.3.md | 52 ++++++++++++++++++++++++++++--- libibverbs/verbs.c | 14 +++++++++ libibverbs/verbs.h | 17 ++++++++++ 6 files changed, 81 insertions(+), 5 deletions(-) diff --git a/debian/libibverbs1.symbols b/debian/libibverbs1.symbols index 509383b65..5378bd801 100644 --- a/debian/libibverbs1.symbols +++ b/debian/libibverbs1.symbols @@ -114,6 +114,7 @@ libibverbs.so.1 libibverbs1 #MINVER# ibv_rate_to_mbps@IBVERBS_1.1 1.1.8 ibv_rate_to_mult@IBVERBS_1.0 1.1.6 ibv_read_sysfs_file@IBVERBS_1.0 1.1.6 + ibv_reg_buf_mr@IBVERBS_1.17 64 ibv_reg_dmabuf_mr@IBVERBS_1.12 34 ibv_reg_mr@IBVERBS_1.0 1.1.6 ibv_reg_mr@IBVERBS_1.1 1.1.6 diff --git a/libibverbs/libibverbs.map.in b/libibverbs/libibverbs.map.in index d82112455..300240b03 100644 --- a/libibverbs/libibverbs.map.in +++ b/libibverbs/libibverbs.map.in @@ -181,6 +181,7 @@ IBVERBS_1.17 { global: ibv_alloc_buf; ibv_free_buf; + ibv_reg_buf_mr; } IBVERBS_1.16; /* If any symbols in this stanza change ABI then the entire staza gets a new symbol diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt index 1711807d9..f0e2e7ee0 100644 --- a/libibverbs/man/CMakeLists.txt +++ b/libibverbs/man/CMakeLists.txt @@ -88,6 +88,7 @@ rdma_man_pages( ) rdma_alias_man_pages( ibv_alloc_buf.3 ibv_free_buf.3 + ibv_alloc_buf.3 ibv_reg_buf_mr.3 ibv_alloc_dm.3 ibv_free_dm.3 ibv_alloc_dm.3 ibv_reg_dm_mr.3 ibv_alloc_dm.3 ibv_memcpy_to_dm.3 diff 
--git a/libibverbs/man/ibv_alloc_buf.3.md b/libibverbs/man/ibv_alloc_buf.3.md index 310e9affc..021c53f82 100644 --- a/libibverbs/man/ibv_alloc_buf.3.md +++ b/libibverbs/man/ibv_alloc_buf.3.md @@ -10,7 +10,7 @@ title: ibv_alloc_buf # NAME -ibv_alloc_buf, ibv_free_buf - allocate and free provider-aware buffers +ibv_alloc_buf, ibv_free_buf, ibv_reg_buf_mr - allocate provider-aware buffers and register them as memory regions # SYNOPSIS @@ -20,6 +20,9 @@ ibv_alloc_buf, ibv_free_buf - allocate and free provider-aware buffers void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); void ibv_free_buf(struct ibv_buf *buf); + +struct ibv_mr *ibv_reg_buf_mr(struct ibv_pd *pd, struct ibv_buf *buf, void *addr, + size_t length, int access); ``` # DESCRIPTION @@ -27,22 +30,61 @@ void ibv_free_buf(struct ibv_buf *buf); **ibv_alloc_buf()** allocates a buffer using the allocation method selected by the provider for the protection domain *pd*. On success it returns the mapped address and stores an opaque buffer handle in *buf*. The handle is used by -**ibv_free_buf()** and must not be interpreted by applications. +**ibv_free_buf()**, **ibv_reg_buf_mr()**, and **ibv_reg_mr_ex()** with +**IBV_REG_MR_MASK_BUF**, and must not be interpreted by applications. + +**ibv_free_buf()** releases a buffer handle returned by **ibv_alloc_buf()**. The +protection domain used for allocation must remain valid until the buffer is +freed. + +**ibv_reg_buf_mr()** registers a memory region for a buffer returned by +**ibv_alloc_buf()**. Applications can register the same buffer through +**ibv_reg_mr_ex()** by setting both **IBV_REG_MR_MASK_BUF** and +**IBV_REG_MR_MASK_ADDR**, passing the buffer handle in `mr_init_attr->buf` and +the address to register in `mr_init_attr->addr`. 
When **IBV_REG_MR_MASK_BUF** +is set the caller must not set **IBV_REG_MR_MASK_FD** or +**IBV_REG_MR_MASK_FD_OFFSET**, and, for a DMA-buf backed buffer, must not set +**IBV_REG_MR_MASK_IOVA**; libibverbs derives these from the buffer handle and +otherwise fails with **EINVAL**. The *pd* argument must be +the same protection domain that was used to allocate the buffer. +If a different protection domain is supplied, registration fails +with **EINVAL**. For ordinary memory it behaves like **ibv_reg_mr()**. +For provider allocations backed by a DMA-buf, it registers +the corresponding DMA-buf range using the metadata stored in +the opaque *buf* handle. # ARGUMENTS *pd* -: The protection domain (or parent domain) to allocate from; its provider selects the buffer's backing allocation method. It must remain valid until the buffer is freed with **ibv_free_buf()**. +: For **ibv_alloc_buf()**, the protection domain (or parent domain) to allocate from; its provider selects the buffer's backing allocation method. It must remain valid until the buffer is freed with **ibv_free_buf()**. For **ibv_reg_buf_mr()**, the same protection domain that allocated *buf* (otherwise registration fails with **EINVAL**). *size* -: Size of the buffer to allocate, in bytes. +: Size of the buffer to allocate, in bytes (**ibv_alloc_buf()**). *buf* -: For **ibv_alloc_buf()**, an output parameter set on success to an opaque buffer handle. For **ibv_free_buf()**, the buffer handle returned by **ibv_alloc_buf()** that is to be released. +: For **ibv_alloc_buf()**, an output parameter set on success to an opaque buffer handle. For **ibv_free_buf()** and **ibv_reg_buf_mr()**, the buffer handle returned by **ibv_alloc_buf()** to be released or registered, respectively. + +*addr* +: The start address to register (**ibv_reg_buf_mr()**): the buffer base returned by **ibv_alloc_buf()** or an address within that buffer. 
+ +*length* +: Length in bytes to register (**ibv_reg_buf_mr()**); *addr* + *length* must stay within the buffer. + +*access* +: Access flags for the memory region (**ibv_reg_buf_mr()**), the same as for **ibv_reg_mr()**. # RETURN VALUE **ibv_alloc_buf()** returns the mapped buffer address on success, or NULL if the request fails. +**ibv_reg_buf_mr()** returns a pointer to the registered MR on success, or NULL +if the request fails. + **ibv_free_buf()** does not return a value. + +# SEE ALSO + +**ibv_reg_mr**(3), +**ibv_reg_mr_ex**(3), +**ibv_reg_dmabuf_mr**(3) diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index 8e5244d7e..0565be208 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -440,6 +440,20 @@ void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf) return get_ops(pd->context)->alloc_buf(pd, size, buf); } +struct ibv_mr *ibv_reg_buf_mr(struct ibv_pd *pd, struct ibv_buf *buf, + void *addr, size_t length, int access) +{ + struct ibv_mr_init_attr mr_init_attr = { + .length = length, + .access = access, + .comp_mask = IBV_REG_MR_MASK_BUF | IBV_REG_MR_MASK_ADDR, + .addr = addr, + .buf = buf, + }; + + return ibv_reg_mr_ex(pd, &mr_init_attr); +} + void ibv_free_buf(struct ibv_buf *buf) { get_ops(buf->pd->context)->free_buf(buf); diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 9a90f2128..67832e4ff 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -3208,6 +3208,9 @@ ibv_alloc_parent_domain(struct ibv_context *context, * @size: Buffer size in bytes * @buf: On success, set to a handle for ibv_free_buf() * + * The returned address is at least page aligned, so it can be registered + * with ibv_reg_buf_mr() regardless of how the buffer is backed. + * * Returns the usable buffer address on success, or NULL on failure with * errno set. 
*/ @@ -3219,6 +3222,20 @@ void *ibv_alloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); */ void ibv_free_buf(struct ibv_buf *buf); +/** + * ibv_reg_buf_mr - Register an MR for a buffer from ibv_alloc_buf + * @pd: Protection domain + * @buf: Handle from ibv_alloc_buf() + * @addr: Start address to register; the buffer base returned by + * ibv_alloc_buf() or an address within that buffer + * @length: Length in bytes; @addr + @length must stay within the buffer + * @access: Access flags (IBV_ACCESS_*) + * + * Returns the registered MR on success, or NULL on failure with errno set. + */ +struct ibv_mr *ibv_reg_buf_mr(struct ibv_pd *pd, struct ibv_buf *buf, + void *addr, size_t length, int access); + /** * ibv_alloc_dmah - Allocate a dma handle */ From c628b762551fa46053145ce0b153692a49b078c4 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 24 Jun 2026 13:36:04 +0300 Subject: [PATCH 09/33] libibverbs: Fall back to the write ABI in ibv_cmd_reg_mr_ex() ibv_cmd_reg_mr_ex() issued the REG_MR ioctl unconditionally (execute_ioctl) with no fallback to the legacy write-based registration, unlike ibv_cmd_reg_mr() which is write-ABI based. An address-based MR registered through the extended path therefore fails on a kernel old enough to lack the REG_MR ioctl method, even though it still supports the legacy write ABI that ibv_cmd_reg_mr() uses. Issue the command through the existing ioctl/write fallback machinery instead: declare the command buffer with DECLARE_FBCMD_BUFFER and dispatch via execute_ioctl_fallback(). On TRY_WRITE -- the kernel has no REG_MR ioctl method -- fall back to the legacy write REG_MR command via ibv_cmd_reg_mr() for the address-based case. A dma-buf (fd) or DMA-handle request has no write-ABI equivalent, and a kernel without the ioctl could not service it anyway, so those return EOPNOTSUPP. 
Signed-off-by: Yishai Hadas Signed-off-by: Jiri Pirko Co-Authored-By: Claude Opus 4.8 --- libibverbs/cmd_mr.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/libibverbs/cmd_mr.c b/libibverbs/cmd_mr.c index f45ce3992..02c7f2e04 100644 --- a/libibverbs/cmd_mr.c +++ b/libibverbs/cmd_mr.c @@ -160,8 +160,8 @@ int ibv_cmd_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, struct ibv_mr_init_attr *mr_init_attr) { - DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_MR, - UVERBS_METHOD_REG_MR, 11); + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_MR, 11, NULL); bool fd_based = (mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD); struct ib_uverbs_attr *handle; uint64_t length = mr_init_attr->length; @@ -222,9 +222,43 @@ int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, fill_attr_in_obj(cmdb, UVERBS_ATTR_REG_MR_DMA_HANDLE, verbs_get_dmah(mr_init_attr->dmah)->handle); - ret = execute_ioctl(pd->context, cmdb); - if (ret) - return errno; + switch (execute_ioctl_fallback(pd->context, reg_mr_ex, cmdb, &ret)) { + case SUCCESS: + break; + + case TRY_WRITE: + case TRY_WRITE_EX: { + /* + * The kernel has no REG_MR ioctl method. Only an address-based + * request maps to the legacy write-ABI REG_MR command; a + * dma-buf (fd) or DMA-handle request has no write-ABI + * equivalent (and such a kernel could not service it anyway). + * This mirrors ibv_cmd_reg_mr()'s write path so an ordinary + * buffer registered via ibv_reg_mr_ex() -- e.g. through + * ibv_reg_buf_mr() -- keeps working on a write-ABI-only kernel. + */ + struct ib_uverbs_reg_mr_resp wresp; + struct ibv_reg_mr req; + uint64_t hca_va; + + if (fd_based || + (mr_init_attr->comp_mask & IBV_REG_MR_MASK_DMAH)) { + errno = EOPNOTSUPP; + return EOPNOTSUPP; + } + + hca_va = (mr_init_attr->comp_mask & IBV_REG_MR_MASK_IOVA) ? 
+ mr_init_attr->iova : + (uintptr_t)mr_init_attr->addr; + + return ibv_cmd_reg_mr(pd, mr_init_attr->addr, length, hca_va, + mr_init_attr->access, vmr, &req, + sizeof(req), &wresp, sizeof(wresp)); + } + + default: + return ret; + } vmr->ibv_mr.handle = read_attr_obj(UVERBS_ATTR_REG_MR_HANDLE, handle); From 7691b5180d8ef4ce62baf5074cc31d262a8f2368 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 10:55:03 +0100 Subject: [PATCH 10/33] libibverbs: Add parent domain ALLOW_CC_UNPROTECTED_ALLOC flag Add CoCo unprotected alloc flag for applications to opt-in to unprotected/shared memory allocation via parent domain creation. Signed-off-by: Jiri Pirko --- libibverbs/man/ibv_alloc_buf.3.md | 9 +++++++++ libibverbs/man/ibv_alloc_parent_domain.3 | 16 ++++++++++++++++ libibverbs/verbs.h | 1 + 3 files changed, 26 insertions(+) diff --git a/libibverbs/man/ibv_alloc_buf.3.md b/libibverbs/man/ibv_alloc_buf.3.md index 021c53f82..8809c6b72 100644 --- a/libibverbs/man/ibv_alloc_buf.3.md +++ b/libibverbs/man/ibv_alloc_buf.3.md @@ -83,8 +83,17 @@ if the request fails. **ibv_free_buf()** does not return a value. +# NOTES + +Applications running in a CoCo guest that need unprotected/shared memory should +first create a parent domain with +**IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC**, then use that +parent domain as the *pd* argument to **ibv_alloc_buf()** and +**ibv_reg_buf_mr()** (or **ibv_reg_mr_ex()** with **IBV_REG_MR_MASK_BUF**). 
+ # SEE ALSO +**ibv_alloc_parent_domain**(3), **ibv_reg_mr**(3), **ibv_reg_mr_ex**(3), **ibv_reg_dmabuf_mr**(3) diff --git a/libibverbs/man/ibv_alloc_parent_domain.3 b/libibverbs/man/ibv_alloc_parent_domain.3 index edca89979..8f00c60e2 100644 --- a/libibverbs/man/ibv_alloc_parent_domain.3 +++ b/libibverbs/man/ibv_alloc_parent_domain.3 @@ -45,6 +45,7 @@ enum ibv_parent_domain_init_attr_mask { .in +8 IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS = 1 << 0, IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT = 1 << 1, +IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC = 1 << 2, .in -8 }; @@ -78,6 +79,21 @@ An optional thread domain that the parent domain uses. .B comp_mask Bit-mask of optional fields in the ibv_parent_domain_init_attr struct. .PP +.B IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC +When set, allows a provider to allocate unprotected/shared memory for +provider-managed buffers when running in a Confidential Computing (CoCo) +guest on a device that reports +.B IBV_DEVICE_CC_DMA_BOUNCE\fR. +If the device does not report this flag, providers may continue to use +plain memory. Applications opt in and then allocate memory through +.B ibv_alloc_buf() +and register it with +.B ibv_reg_buf_mr() +or +.B ibv_reg_mr_ex() +with +.B IBV_REG_MR_MASK_BUF\fR. +.PP .B alloc Custom memory allocation function for this parent domain. Provider memory allocations will use this function to allocate the needed memory. 
diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 67832e4ff..70b015095 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -2119,6 +2119,7 @@ struct ibv_cq_init_attr_ex { enum ibv_parent_domain_init_attr_mask { IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS = 1 << 0, IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT = 1 << 1, + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC = 1 << 2, }; #define IBV_ALLOCATOR_USE_DEFAULT ((void *)-1) From 47d990751338723e9993f368b2f4e6674022d435 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 10:54:52 +0100 Subject: [PATCH 11/33] libibverbs: Add CoCo DMA bounce flag Add a flag the kernel sets when the device is in a CoCo guest and requires DMA bounce buffering. Signed-off-by: Jiri Pirko --- libibverbs/examples/devinfo.c | 5 ++++- libibverbs/man/ibv_query_device_ex.3 | 10 ++++++++++ libibverbs/verbs.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c index c5088aba2..1e8fcfb46 100644 --- a/libibverbs/examples/devinfo.c +++ b/libibverbs/examples/devinfo.c @@ -390,12 +390,15 @@ static void print_device_cap_flags_ex(uint64_t device_cap_flags_ex) { uint64_t ex_flags = device_cap_flags_ex & 0xffffffff00000000ULL; uint64_t unknown_flags = ~(IBV_DEVICE_RAW_SCATTER_FCS | - IBV_DEVICE_PCI_WRITE_END_PADDING); + IBV_DEVICE_PCI_WRITE_END_PADDING | + IBV_DEVICE_CC_DMA_BOUNCE); if (ex_flags & IBV_DEVICE_RAW_SCATTER_FCS) printf("\t\t\t\t\tRAW_SCATTER_FCS\n"); if (ex_flags & IBV_DEVICE_PCI_WRITE_END_PADDING) printf("\t\t\t\t\tPCI_WRITE_END_PADDING\n"); + if (ex_flags & IBV_DEVICE_CC_DMA_BOUNCE) + printf("\t\t\t\t\tCC_DMA_BOUNCE\n"); if (ex_flags & unknown_flags) printf("\t\t\t\t\tUnknown flags: 0x%" PRIX64 "\n", ex_flags & unknown_flags); diff --git a/libibverbs/man/ibv_query_device_ex.3 b/libibverbs/man/ibv_query_device_ex.3 index c77e8b4f8..c7b6764ab 100644 --- a/libibverbs/man/ibv_query_device_ex.3 +++ b/libibverbs/man/ibv_query_device_ex.3 @@ 
-160,6 +160,16 @@ This feature can be enabled on a QP or WQ basis via the IBV_QP_CREATE_PCI_WRITE_END_PADDING or IBV_WQ_FLAGS_PCI_WRITE_END_PADDING flags. +.TP 7 +IBV_DEVICE_CC_DMA_BOUNCE + +Indicates the device is running inside a Confidential Computing (CoCo) guest +and requires DMA bounce buffering. Applications that explicitly opt in to +unprotected/shared memory allocation should create a parent domain with +IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC, allocate buffers +through ibv_alloc_buf(), and register them with ibv_reg_buf_mr() (or +ibv_reg_mr_ex() with IBV_REG_MR_MASK_BUF). + .SH "RETURN VALUE" .B ibv_query_device_ex() returns 0 on success, or the value of errno on failure (which indicates the failure reason). diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 70b015095..60103d98a 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -151,6 +151,7 @@ enum ibv_fork_status { */ #define IBV_DEVICE_RAW_SCATTER_FCS (1ULL << 34) #define IBV_DEVICE_PCI_WRITE_END_PADDING (1ULL << 36) +#define IBV_DEVICE_CC_DMA_BOUNCE (1ULL << 41) enum ibv_atomic_cap { IBV_ATOMIC_NONE, From 461d67caf9a16b84d190ea74355b9e93aa8cf826 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 May 2026 13:38:42 +0200 Subject: [PATCH 12/33] libibverbs: Add per-buffer UMEM helper The new umem UAPI gives every dmabuf-backed buffer (CQ ring, QP main/RQ/SQ, mlx5 doorbell record) its own ioctl attribute of type UVERBS_ATTR_UMEM whose payload is a single struct ib_uverbs_buffer_desc. Add a per-attribute helper, fill_attr_in_buf_umem(), that providers call once per buffer they want to register through the kernel. 
Signed-off-by: Jiri Pirko --- libibverbs/driver.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/libibverbs/driver.h b/libibverbs/driver.h index e7f420be8..d6ad39066 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -830,4 +831,25 @@ static inline void ibv_initialize_parent_domain(struct ibv_pd *parent_domain, parent_domain->handle = contained_pd->handle; } +/** + * In case the length is 0, the buffer size is used. + */ +static inline void +fill_attr_in_buf_umem(struct ibv_command_buffer *cmdb, uint16_t attr_id, + struct ib_uverbs_buffer_desc *storage, + struct ibv_buf *buf, void *addr, size_t length) +{ + if (!buf || buf->dmabuf_fd == -1) + return; + + *storage = (struct ib_uverbs_buffer_desc){ + .type = IB_UVERBS_BUFFER_TYPE_DMABUF, + .fd = buf->dmabuf_fd, + .addr = addr ? (uintptr_t)addr - (uintptr_t)buf->addr : 0, + .length = length ? length : buf->size, + }; + fill_attr_in_ptr(cmdb, attr_id, storage); + cmdb->fallback_ioctl_only = 1; +} + #endif /* INFINIBAND_DRIVER_H */ From d2782f9d76cfcfc63e7b07e29010ccd520d877bd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 May 2026 13:39:10 +0200 Subject: [PATCH 13/33] libibverbs: Extend ibv_cmd_create_qp_ex2() with driver chain Add a driver command-buffer chain argument to the private QP create helper ibv_cmd_create_qp_ex2() and build its command buffer with DECLARE_CMD_BUFFER_LINK_COMPAT(), mirroring ibv_cmd_create_cq_ex2(). This lets a provider link a driver-namespace command buffer onto the create-QP ioctl; later commits use it to attach per-buffer UMEM attributes (UVERBS_ATTR_CREATE_QP_*_BUF_UMEM). Update all existing callers (mlx4, mlx5, hns, mana, rxe, ionic) to pass NULL as they have no driver chain to attach. 
Signed-off-by: Jiri Pirko --- libibverbs/cmd_qp.c | 9 +++++---- libibverbs/driver.h | 3 ++- providers/hns/hns_roce_u_verbs.c | 2 +- providers/ionic/ionic_verbs.c | 2 +- providers/mana/qp.c | 2 +- providers/mlx4/verbs.c | 4 ++-- providers/mlx5/verbs.c | 4 ++-- providers/rxe/rxe.c | 2 +- 8 files changed, 15 insertions(+), 13 deletions(-) diff --git a/libibverbs/cmd_qp.c b/libibverbs/cmd_qp.c index 499b241e5..686222a1b 100644 --- a/libibverbs/cmd_qp.c +++ b/libibverbs/cmd_qp.c @@ -425,11 +425,12 @@ int ibv_cmd_create_qp_ex2(struct ibv_context *context, struct ibv_create_qp_ex *cmd, size_t cmd_size, struct ib_uverbs_ex_create_qp_resp *resp, - size_t resp_size) + size_t resp_size, + struct ibv_command_buffer *driver) { - DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_QP, - UVERBS_METHOD_QP_CREATE, cmd, cmd_size, resp, - resp_size); + DECLARE_CMD_BUFFER_LINK_COMPAT(cmdb, UVERBS_OBJECT_QP, + UVERBS_METHOD_QP_CREATE, + driver, cmd, cmd_size, resp, resp_size); if (!check_comp_mask(attr_ex->comp_mask, IBV_QP_INIT_ATTR_PD | diff --git a/libibverbs/driver.h b/libibverbs/driver.h index d6ad39066..ec72c7748 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -698,7 +698,8 @@ int ibv_cmd_create_qp_ex2(struct ibv_context *context, struct ibv_create_qp_ex *cmd, size_t cmd_size, struct ib_uverbs_ex_create_qp_resp *resp, - size_t resp_size); + size_t resp_size, + struct ibv_command_buffer *driver); int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_open_attr *attr, diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c index 7a861ba32..e19a76675 100644 --- a/providers/hns/hns_roce_u_verbs.c +++ b/providers/hns/hns_roce_u_verbs.c @@ -1446,7 +1446,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr, ret = ibv_cmd_create_qp_ex2(&ctx->ibv_ctx.context, &qp->verbs_qp, attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), - &resp_ex.ibv_resp, sizeof(resp_ex)); + &resp_ex.ibv_resp, sizeof(resp_ex), NULL); if 
(ret) { verbs_err(&ctx->ibv_ctx, "failed to exec create qp cmd, ret = %d.\n", ret); diff --git a/providers/ionic/ionic_verbs.c b/providers/ionic/ionic_verbs.c index 6452894e8..64981f52f 100644 --- a/providers/ionic/ionic_verbs.c +++ b/providers/ionic/ionic_verbs.c @@ -1739,7 +1739,7 @@ static struct ibv_qp *ionic_create_qp_ex(struct ibv_context *ibctx, &req.ibv_cmd, sizeof(req), &resp.ibv_resp, - sizeof(resp)); + sizeof(resp), NULL); if (rc) goto err_cmd; diff --git a/providers/mana/qp.c b/providers/mana/qp.c index 1b828ff6a..b6a9a7ecc 100644 --- a/providers/mana/qp.c +++ b/providers/mana/qp.c @@ -526,7 +526,7 @@ static struct ibv_qp *mana_create_qp_ex_raw(struct ibv_context *context, cmd_drv->port = port; ret = ibv_cmd_create_qp_ex2(context, &qp->ibqp, attr, &cmd.ibv_cmd, - sizeof(cmd), &resp.ibv_resp, sizeof(resp)); + sizeof(cmd), &resp.ibv_resp, sizeof(resp), NULL); if (ret) { verbs_err(verbs_get_ctx(context), "Create QP EX failed\n"); free(qp); diff --git a/providers/mlx4/verbs.c b/providers/mlx4/verbs.c index 67c47f699..d88c319c2 100644 --- a/providers/mlx4/verbs.c +++ b/providers/mlx4/verbs.c @@ -804,7 +804,7 @@ static int mlx4_cmd_create_qp_ex_rss(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp.ibv_resp, - sizeof(resp)); + sizeof(resp), NULL); return ret; } @@ -863,7 +863,7 @@ static int mlx4_cmd_create_qp_ex(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp.ibv_resp, - sizeof(resp)); + sizeof(resp), NULL); return ret; } diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 805a25e8e..1c93f0941 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -2168,7 +2168,7 @@ static int mlx5_cmd_create_rss_qp(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, attr, &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss), - &resp.ibv_resp, sizeof(resp)); + &resp.ibv_resp, 
sizeof(resp), NULL); if (ret) return ret; @@ -2201,7 +2201,7 @@ static int mlx5_cmd_create_qp_ex(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp->ibv_resp, - sizeof(*resp)); + sizeof(*resp), NULL); return ret; } diff --git a/providers/rxe/rxe.c b/providers/rxe/rxe.c index 541f1c42f..507f4aab2 100644 --- a/providers/rxe/rxe.c +++ b/providers/rxe/rxe.c @@ -1396,7 +1396,7 @@ static struct ibv_qp *rxe_create_qp_ex(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->vqp, attr, &cmd, cmd_size, - &resp.ibv_resp, resp_size); + &resp.ibv_resp, resp_size, NULL); if (ret) goto err_free; From 8b6cab4df7a43895bc56994011956ad6a66e4dae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 09:37:36 +0100 Subject: [PATCH 14/33] libibverbs: Add internal DMA-buf heap allocator library Add dmabuf_heap.c/h including a set of internal helpers for providers to allocate memory from Linux DMA-buf heaps (/dev/dma_heap/). Add a helper to initialize "system_cc_shared" heap allocator. 
Signed-off-by: Jiri Pirko --- CMakeLists.txt | 2 + buildlib/config.h.in | 2 + libibverbs/CMakeLists.txt | 2 + libibverbs/dmabuf_heap.c | 124 +++++++++++++++++++++++++++++++++++ libibverbs/dmabuf_heap.h | 53 +++++++++++++++ libibverbs/libibverbs.map.in | 4 ++ 6 files changed, 187 insertions(+) create mode 100644 libibverbs/dmabuf_heap.c create mode 100644 libibverbs/dmabuf_heap.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 140ea8c5d..14dc41d65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -499,6 +499,8 @@ if (NOT HAVE_SPARSE) RDMA_DoFixup("${HAVE_STDATOMIC}" "stdatomic.h") endif() +CHECK_INCLUDE_FILE("linux/dma-heap.h" HAVE_LINUX_DMA_HEAP_H) + RDMA_Check_SSE(HAVE_TARGET_SSE) RDMA_Check_LS64(HAVE_LS64) diff --git a/buildlib/config.h.in b/buildlib/config.h.in index eb4c63310..0a7769e64 100644 --- a/buildlib/config.h.in +++ b/buildlib/config.h.in @@ -52,6 +52,8 @@ #cmakedefine HAVE_WORKING_IF_H 1 +#cmakedefine HAVE_LINUX_DMA_HEAP_H 1 + #cmakedefine HAVE_SOCKADDR_ARG_AS_UNION 1 // Operating mode for symbol versions diff --git a/libibverbs/CMakeLists.txt b/libibverbs/CMakeLists.txt index f37a3dddb..7fe44d861 100644 --- a/libibverbs/CMakeLists.txt +++ b/libibverbs/CMakeLists.txt @@ -11,6 +11,7 @@ publish_headers(infiniband publish_internal_headers(infiniband cmd_ioctl.h cmd_write.h + dmabuf_heap.h driver.h kern-abi.h marshall.h @@ -31,6 +32,7 @@ rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" cmd_dm.c cmd_dmabuf.c cmd_dmah.c + dmabuf_heap.c cmd_fallback.c cmd_flow.c cmd_flow_action.c diff --git a/libibverbs/dmabuf_heap.c b/libibverbs/dmabuf_heap.c new file mode 100644 index 000000000..7532a6e12 --- /dev/null +++ b/libibverbs/dmabuf_heap.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * DMA-buf heap allocator implementation + */ + +#include "config.h" + +#include +#include + +#if HAVE_LINUX_DMA_HEAP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ibv_dmabuf_heap { + int heap_fd; +}; + +static struct ibv_dmabuf_heap *ibv_dmabuf_heap_init(const char *heap_name) +{ + struct ibv_dmabuf_heap *heap; + char path[256]; + int fd; + + if (!heap_name) { + errno = EINVAL; + return NULL; + } + + if (!check_snprintf(path, sizeof(path), "/dev/dma_heap/%s", heap_name)) { + errno = ENOMEM; + return NULL; + } + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) + return NULL; + + heap = calloc(1, sizeof(*heap)); + if (!heap) { + close(fd); + errno = ENOMEM; + return NULL; + } + + heap->heap_fd = fd; + return heap; +} + +struct ibv_dmabuf_heap *ibv_dmabuf_heap_cc_shared_init(void) +{ + return ibv_dmabuf_heap_init("system_cc_shared"); +} + +void ibv_dmabuf_heap_destroy(struct ibv_dmabuf_heap *heap) +{ + close(heap->heap_fd); + free(heap); +} + +void *ibv_dmabuf_heap_alloc(struct ibv_dmabuf_heap *heap, size_t size, + int *dmabuf_fd) +{ + struct dma_heap_allocation_data heap_data = {}; + void *buf; + int fd; + + heap_data.len = size; + heap_data.fd_flags = O_RDWR | O_CLOEXEC; + if (ioctl(heap->heap_fd, DMA_HEAP_IOCTL_ALLOC, &heap_data) < 0) + return NULL; + + fd = heap_data.fd; + + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (buf == MAP_FAILED) { + close(fd); + return NULL; + } + + *dmabuf_fd = fd; + return buf; +} + +void ibv_dmabuf_heap_free(void *buf, size_t size, int dmabuf_fd) +{ + munmap(buf, size); + close(dmabuf_fd); +} + +#else /* !HAVE_LINUX_DMA_HEAP_H */ + +struct ibv_dmabuf_heap *ibv_dmabuf_heap_cc_shared_init(void) +{ + errno = EOPNOTSUPP; + return NULL; +} + +void ibv_dmabuf_heap_destroy(struct ibv_dmabuf_heap *heap) +{ +} + +void *ibv_dmabuf_heap_alloc(struct ibv_dmabuf_heap *heap, size_t size, + int *dmabuf_fd) +{ + errno = EOPNOTSUPP; + return NULL; +} + +void 
ibv_dmabuf_heap_free(void *buf, size_t size, int dmabuf_fd) +{ +} + +#endif /* HAVE_LINUX_DMA_HEAP_H */ diff --git a/libibverbs/dmabuf_heap.h b/libibverbs/dmabuf_heap.h new file mode 100644 index 000000000..08793697d --- /dev/null +++ b/libibverbs/dmabuf_heap.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * DMA-buf heap allocator API + * + * Provides a simple interface for allocating memory from Linux DMA-buf heaps. + * Currently only the CoCo shared heap is supported. The allocated buffers are + * mmap'd into user-space and can be used with RDMA verbs. + */ + +#ifndef INFINIBAND_DMABUF_HEAP_H +#define INFINIBAND_DMABUF_HEAP_H + +#include + +struct ibv_dmabuf_heap; + +/** + * ibv_dmabuf_heap_cc_shared_init - Open the CoCo shared DMA-buf heap + */ +struct ibv_dmabuf_heap *ibv_dmabuf_heap_cc_shared_init(void); + +/** + * ibv_dmabuf_heap_destroy - Close the heap device and free the handle + * @heap: Heap handle from ibv_dmabuf_heap_cc_shared_init() + */ +void ibv_dmabuf_heap_destroy(struct ibv_dmabuf_heap *heap); + +/** + * ibv_dmabuf_heap_alloc - Allocate a buffer from the DMA-buf heap + * @heap: Heap handle from ibv_dmabuf_heap_cc_shared_init() + * @size: Requested buffer size in bytes + * @dmabuf_fd: On success, set to the DMA-buf file descriptor + * + * Returns an mmap'd pointer on success, or NULL on failure with errno set. + * The caller must store both the returned pointer and *dmabuf_fd and pass + * them to ibv_dmabuf_heap_free() when done. + */ +void *ibv_dmabuf_heap_alloc(struct ibv_dmabuf_heap *heap, size_t size, + int *dmabuf_fd); + +/** + * ibv_dmabuf_heap_free - Free a buffer allocated with ibv_dmabuf_heap_alloc + * @buf: Pointer returned by ibv_dmabuf_heap_alloc() + * @size: Same size passed to ibv_dmabuf_heap_alloc() + * @dmabuf_fd: DMA-buf fd returned by ibv_dmabuf_heap_alloc() + * + * Unmaps the buffer and closes the DMA-buf fd.
+ */ +void ibv_dmabuf_heap_free(void *buf, size_t size, int dmabuf_fd); + +#endif /* INFINIBAND_DMABUF_HEAP_H */ diff --git a/libibverbs/libibverbs.map.in b/libibverbs/libibverbs.map.in index 300240b03..dd7fb8a88 100644 --- a/libibverbs/libibverbs.map.in +++ b/libibverbs/libibverbs.map.in @@ -259,6 +259,10 @@ IBVERBS_PRIVATE_@IBVERBS_PABI_VERSION@ { ibv_cmd_req_notify_cq; ibv_cmd_rereg_mr; ibv_cmd_resize_cq; + ibv_dmabuf_heap_alloc; + ibv_dmabuf_heap_cc_shared_init; + ibv_dmabuf_heap_destroy; + ibv_dmabuf_heap_free; ibv_query_gid_type; ibv_read_ibdev_sysfs_file; ibv_wr_opcode_str; From 9edb3bb64a984d6e2a0af6c00aa1e3000d650f1d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 7 Apr 2026 14:00:20 +0200 Subject: [PATCH 15/33] mlx5: Adopt struct ibv_buf for internal buffer allocations Replace the separate void *buf and size_t length fields in struct mlx5_buf with an embedded struct ibv_buf, aligning the provider's internal buffer representation with the common ibv_buf abstraction. Link struct mlx5_buf to the ibv_buf API (pd, addr, size) instead of maintaining duplicate fields. Add a pd argument to the internal allocation helpers and initialize the embedded buffer through ibv_buf_init(), so each buffer records its owning protection domain. 
Signed-off-by: Jiri Pirko --- providers/mlx5/buf.c | 81 ++++++++++++++++++++++------------------ providers/mlx5/cq.c | 11 +++--- providers/mlx5/dbrec.c | 15 ++++---- providers/mlx5/dr_send.c | 19 +++++----- providers/mlx5/mlx5.c | 10 ++--- providers/mlx5/mlx5.h | 14 +++---- providers/mlx5/qp.c | 2 +- providers/mlx5/srq.c | 7 ++-- providers/mlx5/verbs.c | 39 +++++++++---------- 9 files changed, 101 insertions(+), 97 deletions(-) diff --git a/providers/mlx5/buf.c b/providers/mlx5/buf.c index 3a3a79250..b44ee7d52 100644 --- a/providers/mlx5/buf.c +++ b/providers/mlx5/buf.c @@ -120,18 +120,21 @@ static struct mlx5_hugetlb_mem *alloc_huge_mem(size_t size) } static int alloc_huge_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, - size_t size, int page_size) + size_t size, int page_size, struct ibv_pd *pd) { int found = 0; int nchunk; struct mlx5_hugetlb_mem *hmem; int ret; - buf->length = align(size, MLX5_Q_CHUNK_SIZE); - nchunk = buf->length / MLX5_Q_CHUNK_SIZE; + buf->ibv_buf.size = align(size, MLX5_Q_CHUNK_SIZE); + nchunk = buf->ibv_buf.size / MLX5_Q_CHUNK_SIZE; - if (!nchunk) + if (!nchunk) { + ibv_buf_init(&buf->ibv_buf, pd, NULL, 0); + buf->type = MLX5_ALLOC_TYPE_HUGE; return 0; + } mlx5_spin_lock(&mctx->hugetlb_lock); list_for_each(&mctx->hugetlb_list, hmem, entry) { @@ -151,7 +154,7 @@ static int alloc_huge_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, mlx5_spin_unlock(&mctx->hugetlb_lock); if (!found) { - hmem = alloc_huge_mem(buf->length); + hmem = alloc_huge_mem(buf->ibv_buf.size); if (!hmem) return -1; @@ -169,9 +172,11 @@ static int alloc_huge_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, mlx5_spin_unlock(&mctx->hugetlb_lock); } - buf->buf = hmem->shmaddr + buf->base * MLX5_Q_CHUNK_SIZE; + ibv_buf_init(&buf->ibv_buf, pd, + hmem->shmaddr + buf->base * MLX5_Q_CHUNK_SIZE, + buf->ibv_buf.size); - ret = ibv_dontfork_range(buf->buf, buf->length); + ret = ibv_dontfork_range(buf->ibv_buf.addr, buf->ibv_buf.size); if (ret) { mlx5_dbg(stderr, 
MLX5_DBG_CONTIG, "\n"); goto out_fork; @@ -197,7 +202,7 @@ static void free_huge_buf(struct mlx5_context *ctx, struct mlx5_buf *buf) { int nchunk; - nchunk = buf->length / MLX5_Q_CHUNK_SIZE; + nchunk = buf->ibv_buf.size / MLX5_Q_CHUNK_SIZE; if (!nchunk) return; @@ -213,12 +218,12 @@ static void free_huge_buf(struct mlx5_context *ctx, struct mlx5_buf *buf) void mlx5_free_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - ctx->extern_alloc.free(buf->buf, ctx->extern_alloc.data); + ibv_dofork_range(buf->ibv_buf.addr, buf->ibv_buf.size); + ctx->extern_alloc.free(buf->ibv_buf.addr, ctx->extern_alloc.data); } int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, - size_t size) + size_t size, struct ibv_pd *pd) { void *addr; @@ -231,8 +236,7 @@ int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, ctx->extern_alloc.data); return -1; } - buf->buf = addr; - buf->length = size; + ibv_buf_init(&buf->ibv_buf, pd, addr, size); buf->type = MLX5_ALLOC_TYPE_EXTERNAL; return 0; } @@ -244,18 +248,19 @@ int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, static void mlx5_free_buf_custom(struct mlx5_context *ctx, struct mlx5_buf *buf) { - struct mlx5_parent_domain *mparent_domain = buf->mparent_domain; + struct mlx5_parent_domain *mparent_domain = + to_mparent_domain(buf->ibv_buf.pd); mparent_domain->free(&mparent_domain->mpd.ibv_pd, mparent_domain->pd_context, - buf->buf, + buf->ibv_buf.addr, buf->resource_type); } static int mlx5_alloc_buf_custom(struct mlx5_context *ctx, - struct mlx5_buf *buf, size_t size) + struct mlx5_buf *buf, size_t size, struct ibv_pd *pd) { - struct mlx5_parent_domain *mparent_domain = buf->mparent_domain; + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); void *addr; addr = mparent_domain->alloc(&mparent_domain->mpd.ibv_pd, @@ -266,8 +271,7 @@ static int mlx5_alloc_buf_custom(struct mlx5_context *ctx, return 1; if (addr || 
size == 0) { - buf->buf = addr; - buf->length = size; + ibv_buf_init(&buf->ibv_buf, pd, addr, size); buf->type = MLX5_ALLOC_TYPE_CUSTOM; return 0; } @@ -279,12 +283,13 @@ int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, size_t size, int page_size, enum mlx5_alloc_type type, - const char *component) + const char *component, + struct ibv_pd *pd) { int ret; if (type == MLX5_ALLOC_TYPE_CUSTOM) { - ret = mlx5_alloc_buf_custom(mctx, buf, size); + ret = mlx5_alloc_buf_custom(mctx, buf, size, pd); if (ret <= 0) return ret; @@ -300,7 +305,7 @@ int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, if (type == MLX5_ALLOC_TYPE_HUGE || type == MLX5_ALLOC_TYPE_PREFER_HUGE || type == MLX5_ALLOC_TYPE_ALL) { - ret = alloc_huge_buf(mctx, buf, size, page_size); + ret = alloc_huge_buf(mctx, buf, size, page_size, pd); if (!ret) return 0; @@ -315,7 +320,8 @@ int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, if (type == MLX5_ALLOC_TYPE_CONTIG || type == MLX5_ALLOC_TYPE_PREFER_CONTIG || type == MLX5_ALLOC_TYPE_ALL) { - ret = mlx5_alloc_buf_contig(mctx, buf, size, page_size, component); + ret = mlx5_alloc_buf_contig(mctx, buf, size, page_size, component, + pd); if (!ret) return 0; @@ -326,9 +332,9 @@ int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, } if (type == MLX5_ALLOC_TYPE_EXTERNAL) - return mlx5_alloc_buf_extern(mctx, buf, size); + return mlx5_alloc_buf_extern(mctx, buf, size, pd); - return mlx5_alloc_buf(buf, size, page_size); + return mlx5_alloc_buf(buf, size, page_size, pd); } @@ -483,7 +489,7 @@ static void mlx5_alloc_get_env_info(struct mlx5_context *mctx, int mlx5_alloc_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf, size_t size, int page_size, - const char *component) + const char *component, struct ibv_pd *pd) { void *addr = MAP_FAILED; int block_size_exp; @@ -529,8 +535,7 @@ int mlx5_alloc_buf_contig(struct mlx5_context *mctx, return -1; } - buf->buf = addr; - buf->length = size; + ibv_buf_init(&buf->ibv_buf, pd, addr, 
size); buf->type = MLX5_ALLOC_TYPE_CONTIG; return 0; @@ -538,26 +543,28 @@ int mlx5_alloc_buf_contig(struct mlx5_context *mctx, void mlx5_free_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - munmap(buf->buf, buf->length); + ibv_dofork_range(buf->ibv_buf.addr, buf->ibv_buf.size); + munmap(buf->ibv_buf.addr, buf->ibv_buf.size); } -int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size) +int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size, + struct ibv_pd *pd) { + void *addr; int ret; int al_size; al_size = align(size, page_size); - ret = posix_memalign(&buf->buf, page_size, al_size); + ret = posix_memalign(&addr, page_size, al_size); if (ret) return ret; - ret = ibv_dontfork_range(buf->buf, al_size); + ret = ibv_dontfork_range(addr, al_size); if (ret) - free(buf->buf); + free(addr); if (!ret) { - buf->length = al_size; + ibv_buf_init(&buf->ibv_buf, pd, addr, al_size); buf->type = MLX5_ALLOC_TYPE_ANON; } @@ -566,6 +573,6 @@ int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size) void mlx5_free_buf(struct mlx5_buf *buf) { - ibv_dofork_range(buf->buf, buf->length); - free(buf->buf); + ibv_dofork_range(buf->ibv_buf.addr, buf->ibv_buf.size); + free(buf->ibv_buf.addr); } diff --git a/providers/mlx5/cq.c b/providers/mlx5/cq.c index b2dd66432..4d2ad7f8f 100644 --- a/providers/mlx5/cq.c +++ b/providers/mlx5/cq.c @@ -128,12 +128,12 @@ static inline uint8_t get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) static void *get_buf_cqe(struct mlx5_buf *buf, int n, int cqe_sz) { - return buf->buf + n * cqe_sz; + return buf->ibv_buf.addr + n * cqe_sz; } static void *get_cqe(struct mlx5_cq *cq, int n) { - return cq->active_buf->buf + n * cq->cqe_sz; + return cq->active_buf->ibv_buf.addr + n * cq->cqe_sz; } static void *get_sw_cqe(struct mlx5_cq *cq, int n) @@ -1947,7 +1947,6 @@ int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq, MLX5_CQ_PREFIX, &type, default_type); if (type 
== MLX5_ALLOC_TYPE_CUSTOM) { - buf->mparent_domain = to_mparent_domain(cq->parent_domain); buf->req_alignment = dev->page_size; buf->resource_type = MLX5DV_RES_TYPE_CQ; } @@ -1956,16 +1955,16 @@ int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq, align(nent * cqe_sz, dev->page_size), dev->page_size, type, - MLX5_CQ_PREFIX); + MLX5_CQ_PREFIX, cq->parent_domain); if (ret) return -1; if (buf->type != MLX5_ALLOC_TYPE_CUSTOM) - memset(buf->buf, 0, nent * cqe_sz); + memset(buf->ibv_buf.addr, 0, nent * cqe_sz); for (i = 0; i < nent; ++i) { - cqe = buf->buf + i * cqe_sz; + cqe = buf->ibv_buf.addr + i * cqe_sz; cqe += cqe_sz == 128 ? 1 : 0; cqe->op_own = MLX5_CQE_INVALID << 4; } diff --git a/providers/mlx5/dbrec.c b/providers/mlx5/dbrec.c index e35183f1f..c6ef9415b 100644 --- a/providers/mlx5/dbrec.c +++ b/providers/mlx5/dbrec.c @@ -47,7 +47,8 @@ struct mlx5_db_page { unsigned long free[0]; }; -static struct mlx5_db_page *__add_page(struct mlx5_context *context) +static struct mlx5_db_page *__add_page(struct mlx5_context *context, + struct ibv_pd *pd) { struct mlx5_db_page *page; int ps = to_mdev(context->ibv_ctx.context.device)->page_size; @@ -64,9 +65,9 @@ static struct mlx5_db_page *__add_page(struct mlx5_context *context) return NULL; if (mlx5_is_extern_alloc(context)) - ret = mlx5_alloc_buf_extern(context, &page->buf, ps); + ret = mlx5_alloc_buf_extern(context, &page->buf, ps, pd); else - ret = mlx5_alloc_buf(&page->buf, ps, ps); + ret = mlx5_alloc_buf(&page->buf, ps, ps, pd); if (ret) { free(page); return NULL; @@ -77,7 +78,7 @@ static struct mlx5_db_page *__add_page(struct mlx5_context *context) for (i = 0; i < nlong; ++i) page->free[i] = ~0; - cl_qmap_insert(&context->dbr_map, (uintptr_t) page->buf.buf, + cl_qmap_insert(&context->dbr_map, (uintptr_t) page->buf.ibv_buf.addr, &page->cl_map); list_add(&context->dbr_available_pages, &page->available); @@ -116,7 +117,7 @@ __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, if (page) 
goto found; - page = __add_page(context); + page = __add_page(context, pd); if (!page) goto out; @@ -131,7 +132,7 @@ __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, j = ffsl(page->free[i]); --j; page->free[i] &= ~(1UL << j); - db = page->buf.buf + (i * 8 * sizeof(long) + j) * context->cache_line_size; + db = page->buf.ibv_buf.addr + (i * 8 * sizeof(long) + j) * context->cache_line_size; out: pthread_mutex_unlock(&context->dbr_map_mutex); @@ -164,7 +165,7 @@ void mlx5_free_db(struct mlx5_context *context, __be32 *db, struct ibv_pd *pd, assert(item != cl_qmap_end(&context->dbr_map)); page = (container_of(item, struct mlx5_db_page, cl_map)); - i = ((void *) db - page->buf.buf) / context->cache_line_size; + i = ((void *) db - page->buf.ibv_buf.addr) / context->cache_line_size; page->free[i / (8 * sizeof(long))] |= 1UL << (i % (8 * sizeof(long))); if (page->use_cnt == page->num_db) list_add(&context->dbr_available_pages, &page->available); diff --git a/providers/mlx5/dr_send.c b/providers/mlx5/dr_send.c index 0018b173b..0369f53b7 100644 --- a/providers/mlx5/dr_send.c +++ b/providers/mlx5/dr_send.c @@ -251,6 +251,7 @@ static int dr_calc_wq_size(struct dr_qp *dr_qp, struct dr_qp_init_attr *attr) static int dr_qp_alloc_buf(struct dr_qp *dr_qp, int size) { int al_size; + void *addr; int ret; dr_qp->sq.wqe_head = malloc(dr_qp->sq.wqe_cnt * @@ -261,15 +262,15 @@ static int dr_qp_alloc_buf(struct dr_qp *dr_qp, int size) } al_size = align(size, sysconf(_SC_PAGESIZE)); - ret = posix_memalign(&dr_qp->buf.buf, sysconf(_SC_PAGESIZE), al_size); + ret = posix_memalign(&addr, sysconf(_SC_PAGESIZE), al_size); if (ret) { errno = ret; goto free_wqe_head; } - dr_qp->buf.length = al_size; + ibv_buf_init(&dr_qp->buf.ibv_buf, NULL, addr, al_size); dr_qp->buf.type = MLX5_ALLOC_TYPE_ANON; - memset(dr_qp->buf.buf, 0, dr_qp->buf.length); + memset(dr_qp->buf.ibv_buf.addr, 0, dr_qp->buf.ibv_buf.size); return 0; @@ -298,8 +299,8 @@ static struct dr_qp 
*dr_create_rc_qp(struct ibv_context *ctx, if (dr_qp_alloc_buf(dr_qp, size)) goto err_alloc_bufs; - dr_qp->sq_start = dr_qp->buf.buf + dr_qp->sq.offset; - dr_qp->sq.qend = dr_qp->buf.buf + dr_qp->sq.offset + + dr_qp->sq_start = dr_qp->buf.ibv_buf.addr + dr_qp->sq.offset; + dr_qp->sq.qend = dr_qp->buf.ibv_buf.addr + dr_qp->sq.offset + (dr_qp->sq.wqe_cnt << dr_qp->sq.wqe_shift); dr_qp->rq.head = 0; dr_qp->rq.tail = 0; @@ -320,8 +321,8 @@ static struct dr_qp *dr_create_rc_qp(struct ibv_context *ctx, if (!dr_qp->db_umem) goto err_db_umem; - dr_qp->buf_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->buf.buf, - dr_qp->buf.length, + dr_qp->buf_umem = mlx5dv_devx_umem_reg(ctx, dr_qp->buf.ibv_buf.addr, + dr_qp->buf.ibv_buf.size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); @@ -360,7 +361,7 @@ static struct dr_qp *dr_create_rc_qp(struct ibv_context *ctx, free(dr_qp->db); err_db_alloc: free(dr_qp->sq.wqe_head); - free(dr_qp->buf.buf); + free(dr_qp->buf.ibv_buf.addr); err_alloc_bufs: free(dr_qp); return NULL; @@ -384,7 +385,7 @@ static int dr_destroy_qp(struct dr_qp *dr_qp) free(dr_qp->db); free(dr_qp->sq.wqe_head); - free(dr_qp->buf.buf); + free(dr_qp->buf.ibv_buf.addr); free(dr_qp); return 0; diff --git a/providers/mlx5/mlx5.c b/providers/mlx5/mlx5.c index 857f7adb4..8cc9ad6f4 100644 --- a/providers/mlx5/mlx5.c +++ b/providers/mlx5/mlx5.c @@ -1022,13 +1022,13 @@ static int mlx5dv_get_qp(struct ibv_qp *qp_in, if (mqp->sq_buf_size) /* IBV_QPT_RAW_PACKET */ - qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.buf); + qp_out->sq.buf = (void *)((uintptr_t)mqp->sq_buf.ibv_buf.addr); else - qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset); + qp_out->sq.buf = (void *)((uintptr_t)mqp->buf.ibv_buf.addr + mqp->sq.offset); qp_out->sq.wqe_cnt = mqp->sq.wqe_cnt; qp_out->sq.stride = 1 << mqp->sq.wqe_shift; - qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.buf + mqp->rq.offset); + qp_out->rq.buf = (void *)((uintptr_t)mqp->buf.ibv_buf.addr + 
mqp->rq.offset); qp_out->rq.wqe_cnt = mqp->rq.wqe_cnt; qp_out->rq.stride = 1 << mqp->rq.wqe_shift; @@ -1072,7 +1072,7 @@ static int mlx5dv_get_cq(struct ibv_cq *cq_in, cq_out->cqn = mcq->cqn; cq_out->cqe_cnt = mcq->verbs_cq.cq.cqe + 1; cq_out->cqe_size = mcq->cqe_sz; - cq_out->buf = mcq->active_buf->buf; + cq_out->buf = mcq->active_buf->ibv_buf.addr; cq_out->dbrec = mcq->dbrec; cq_out->cq_uar = mctx->cq_uar_reg; @@ -1103,7 +1103,7 @@ static int mlx5dv_get_srq(struct ibv_srq *srq_in, msrq = container_of(srq_in, struct mlx5_srq, vsrq.srq); - srq_out->buf = msrq->buf.buf; + srq_out->buf = msrq->buf.ibv_buf.addr; srq_out->dbrec = msrq->db; srq_out->stride = 1 << msrq->wqe_shift; srq_out->head = msrq->head; diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index bff29af58..c0c66df40 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -437,14 +437,12 @@ struct mlx5_hugetlb_mem { }; struct mlx5_buf { - void *buf; - size_t length; + struct ibv_buf ibv_buf; int base; struct mlx5_hugetlb_mem *hmem; enum mlx5_alloc_type type; uint64_t resource_type; size_t req_alignment; - struct mlx5_parent_domain *mparent_domain; }; struct mlx5_td { @@ -1113,16 +1111,18 @@ void mlx5_open_debug_file(FILE **dbg_fp); void mlx5_close_debug_file(FILE *dbg_fp); void mlx5_set_debug_mask(void); -int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size); +int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size, + struct ibv_pd *pd); void mlx5_free_buf(struct mlx5_buf *buf); int mlx5_alloc_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf, - size_t size, int page_size, const char *component); + size_t size, int page_size, const char *component, + struct ibv_pd *pd); void mlx5_free_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf); int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, size_t size, int page_size, enum mlx5_alloc_type alloc_type, - const char *component); + const char *component, struct ibv_pd *pd); 
int mlx5_free_actual_buf(struct mlx5_context *ctx, struct mlx5_buf *buf); void mlx5_get_alloc_type(struct mlx5_context *context, struct ibv_pd *pd, @@ -1133,7 +1133,7 @@ int mlx5_use_huge(const char *key); bool mlx5_is_custom_alloc(struct ibv_pd *pd); bool mlx5_is_extern_alloc(struct mlx5_context *context); int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, - size_t size); + size_t size, struct ibv_pd *pd); void mlx5_free_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf); __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c index 60f0e2c75..b28273c83 100644 --- a/providers/mlx5/qp.c +++ b/providers/mlx5/qp.c @@ -64,7 +64,7 @@ static const uint32_t mlx5_ib_opcode[] = { static void *get_recv_wqe(struct mlx5_qp *qp, int n) { - return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); + return qp->buf.ibv_buf.addr + qp->rq.offset + (n << qp->rq.wqe_shift); } static void *get_wq_recv_wqe(struct mlx5_rwq *rwq, int n) diff --git a/providers/mlx5/srq.c b/providers/mlx5/srq.c index b1cc96d39..95ad20e74 100644 --- a/providers/mlx5/srq.c +++ b/providers/mlx5/srq.c @@ -42,7 +42,7 @@ static void *get_wqe(struct mlx5_srq *srq, int n) { - return srq->buf.buf + (n << srq->wqe_shift); + return srq->buf.ibv_buf.addr + (n << srq->wqe_shift); } static inline void set_next_tail(struct mlx5_srq *srq, int next_tail) @@ -390,7 +390,6 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, MLX5_ALLOC_TYPE_ANON); if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { - srq->buf.mparent_domain = to_mparent_domain(pd); srq->buf.req_alignment = to_mdev(context->device)->page_size; srq->buf.resource_type = MLX5DV_RES_TYPE_SRQ; } @@ -399,11 +398,11 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, &srq->buf, buf_size, to_mdev(context->device)->page_size, alloc_type, - MLX5_SRQ_PREFIX)) + MLX5_SRQ_PREFIX, pd)) return -1; if (srq->buf.type != 
MLX5_ALLOC_TYPE_CUSTOM) - memset(srq->buf.buf, 0, buf_size); + memset(srq->buf.ibv_buf.addr, 0, buf_size); srq->head = 0; srq->tail = align_queue_size(orig_max_wr + 1) - 1; diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 1c93f0941..2439ccd00 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -1118,7 +1118,7 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, cq->cqe_sz = cqe_sz; cq->flags = cq_alloc_flags; - cmd_drv->buf_addr = (uintptr_t) cq->buf_a.buf; + cmd_drv->buf_addr = (uintptr_t) cq->buf_a.ibv_buf.addr; cmd_drv->db_addr = (uintptr_t) cq->dbrec; cmd_drv->cqe_size = cqe_sz; @@ -1313,7 +1313,7 @@ int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) goto out; } - cmd.buf_addr = (uintptr_t)cq->resize_buf->buf; + cmd.buf_addr = (uintptr_t)cq->resize_buf->ibv_buf.addr; cmd.cqe_size = cq->resize_cqe_sz; err = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof(cmd), @@ -1419,7 +1419,7 @@ struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, if (!srq->custom_db) *srq->db = 0; - cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.buf_addr = (uintptr_t) srq->buf.ibv_buf.addr; cmd.db_addr = (uintptr_t) srq->db; srq->wq_sig = srq_sig_enabled(); if (srq->wq_sig) @@ -2021,7 +2021,6 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, &alloc_type, default_alloc_type); if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { - qp->buf.mparent_domain = to_mparent_domain(attr->pd); if (attr->qp_type != IBV_QPT_RAW_PACKET && !(qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY)) req_align = mlx5_set_custom_qp_alignment(context, qp); @@ -2033,7 +2032,7 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, align(qp->buf_size, req_align), to_mdev(context->device)->page_size, alloc_type, - MLX5_QP_PREFIX); + MLX5_QP_PREFIX, attr->pd); if (err) { err = -ENOMEM; @@ -2041,7 +2040,7 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, } if (qp->buf.type != MLX5_ALLOC_TYPE_CUSTOM) - memset(qp->buf.buf, 0, qp->buf_size); + 
memset(qp->buf.ibv_buf.addr, 0, qp->buf_size); if (attr->qp_type == IBV_QPT_RAW_PACKET || qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { @@ -2049,7 +2048,6 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, to_mdev(context->device)->page_size); if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { - qp->sq_buf.mparent_domain = to_mparent_domain(attr->pd); qp->sq_buf.req_alignment = to_mdev(context->device)->page_size; qp->sq_buf.resource_type = MLX5DV_RES_TYPE_QP; } @@ -2059,14 +2057,14 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, aligned_sq_buf_size, to_mdev(context->device)->page_size, alloc_type, - MLX5_QP_PREFIX); + MLX5_QP_PREFIX, attr->pd); if (err) { err = -ENOMEM; goto rq_buf; } if (qp->sq_buf.type != MLX5_ALLOC_TYPE_CUSTOM) - memset(qp->sq_buf.buf, 0, aligned_sq_buf_size); + memset(qp->sq_buf.ibv_buf.addr, 0, aligned_sq_buf_size); } return 0; @@ -2091,7 +2089,7 @@ static void mlx5_free_qp_buf(struct mlx5_context *ctx, struct mlx5_qp *qp) { mlx5_free_actual_buf(ctx, &qp->buf); - if (qp->sq_buf.buf) + if (qp->sq_buf.ibv_buf.addr) mlx5_free_actual_buf(ctx, &qp->sq_buf); if (qp->rq.wrid) @@ -2665,12 +2663,12 @@ static struct ibv_qp *create_qp(struct ibv_context *context, if (attr->qp_type == IBV_QPT_RAW_PACKET || qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) { - qp->sq_start = qp->sq_buf.buf; - qp->sq.qend = qp->sq_buf.buf + + qp->sq_start = qp->sq_buf.ibv_buf.addr; + qp->sq.qend = qp->sq_buf.ibv_buf.addr + (qp->sq.wqe_cnt << qp->sq.wqe_shift); } else { - qp->sq_start = qp->buf.buf + qp->sq.offset; - qp->sq.qend = qp->buf.buf + qp->sq.offset + + qp->sq_start = qp->buf.ibv_buf.addr + qp->sq.offset; + qp->sq.qend = qp->buf.ibv_buf.addr + qp->sq.offset + (qp->sq.wqe_cnt << qp->sq.wqe_shift); } @@ -2691,10 +2689,10 @@ static struct ibv_qp *create_qp(struct ibv_context *context, qp->db[MLX5_SND_DBR] = 0; } - cmd.buf_addr = (uintptr_t) qp->buf.buf; + cmd.buf_addr = (uintptr_t) qp->buf.ibv_buf.addr; cmd.sq_buf_addr = (attr->qp_type == IBV_QPT_RAW_PACKET || 
qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) ? - (uintptr_t) qp->sq_buf.buf : 0; + (uintptr_t) qp->sq_buf.ibv_buf.addr : 0; cmd.db_addr = (uintptr_t) qp->db; cmd.sq_wqe_count = qp->sq.wqe_cnt; cmd.rq_wqe_count = qp->rq.wqe_cnt; @@ -3826,7 +3824,7 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, if (!msrq->custom_db) *msrq->db = 0; - cmd.buf_addr = (uintptr_t)msrq->buf.buf; + cmd.buf_addr = (uintptr_t)msrq->buf.ibv_buf.addr; cmd.db_addr = (uintptr_t)msrq->db; msrq->wq_sig = srq_sig_enabled(); if (msrq->wq_sig) @@ -4382,7 +4380,6 @@ static int mlx5_alloc_rwq_buf(struct ibv_context *context, } if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { - rwq->buf.mparent_domain = to_mparent_domain(pd); rwq->buf.req_alignment = to_mdev(context->device)->page_size; rwq->buf.resource_type = MLX5DV_RES_TYPE_RWQ; } @@ -4392,7 +4389,7 @@ static int mlx5_alloc_rwq_buf(struct ibv_context *context, (context->device)->page_size), to_mdev(context->device)->page_size, alloc_type, - MLX5_RWQ_PREFIX); + MLX5_RWQ_PREFIX, pd); if (err) { free(rwq->rq.wrid); @@ -4454,9 +4451,9 @@ static struct ibv_wq *create_wq(struct ibv_context *context, rwq->db[MLX5_SND_DBR] = 0; } - rwq->pbuff = rwq->buf.buf + rwq->rq.offset; + rwq->pbuff = rwq->buf.ibv_buf.addr + rwq->rq.offset; rwq->recv_db = &rwq->db[MLX5_RCV_DBR]; - cmd.buf_addr = (uintptr_t)rwq->buf.buf; + cmd.buf_addr = (uintptr_t)rwq->buf.ibv_buf.addr; cmd.db_addr = (uintptr_t)rwq->db; cmd.rq_wqe_count = rwq->rq.wqe_cnt; cmd.rq_wqe_shift = rwq->rq.wqe_shift; From 0be215c091eb821ad83b7f14e014923ea4467508 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 10:24:57 +0100 Subject: [PATCH 16/33] mlx5: Add DMA-buf heap and CoCo shared allocation support When a parent domain is created with ALLOW_CC_UNPROTECTED_ALLOC and the device reports CC_DMA_BOUNCE, open a dmabuf heap and use it for all provider-internal buffer allocations (CQ, QP, SRQ, RWQ, doorbell records) through the existing preferred allocation path. 
Signed-off-by: Jiri Pirko --- providers/mlx5/buf.c | 63 ++++++++++++++++++++++++++++++++++++++++++ providers/mlx5/cq.c | 3 +- providers/mlx5/dbrec.c | 32 +++++++++++++-------- providers/mlx5/mlx5.h | 8 ++++++ providers/mlx5/srq.c | 5 ++++ providers/mlx5/verbs.c | 38 +++++++++++++++++++++++-- 6 files changed, 134 insertions(+), 15 deletions(-) diff --git a/providers/mlx5/buf.c b/providers/mlx5/buf.c index b44ee7d52..1a6b6cda6 100644 --- a/providers/mlx5/buf.c +++ b/providers/mlx5/buf.c @@ -279,6 +279,49 @@ static int mlx5_alloc_buf_custom(struct mlx5_context *ctx, return -1; } +void mlx5_free_buf_dmabuf(struct mlx5_context *ctx, struct mlx5_buf *buf) +{ + if (!buf->ibv_buf.addr) + return; + ibv_dofork_range(buf->ibv_buf.addr, buf->ibv_buf.size); + ibv_dmabuf_heap_free(buf->ibv_buf.addr, buf->ibv_buf.size, + buf->ibv_buf.dmabuf_fd); +} + +int mlx5_alloc_buf_dmabuf(struct mlx5_context *ctx, + struct mlx5_buf *buf, size_t size, struct ibv_pd *pd) +{ + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + int dmabuf_fd = -1; + void *addr; + + if (!size) { + addr = NULL; + goto out; + } + + /* + * buf->req_alignment is not enforced on the mapping: the buffer is its + * own dma-buf, passed to the kernel as a per-buffer UMEM at offset 0, + * so the resulting HW page offset is 0 and meets any alignment. 
+ */ + addr = ibv_dmabuf_heap_alloc(mparent_domain->dmabuf_heap, size, + &dmabuf_fd); + if (!addr) + return -1; + + if (ibv_dontfork_range(addr, size)) { + ibv_dmabuf_heap_free(addr, size, dmabuf_fd); + return -1; + } + +out: + ibv_buf_init_dmabuf(&buf->ibv_buf, pd, addr, size, + dmabuf_fd); + buf->type = MLX5_ALLOC_TYPE_DMABUF; + return 0; +} + int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, struct mlx5_buf *buf, size_t size, int page_size, @@ -288,6 +331,10 @@ int mlx5_alloc_prefered_buf(struct mlx5_context *mctx, { int ret; + if (type == MLX5_ALLOC_TYPE_DMABUF) + return mlx5_alloc_buf_dmabuf(mctx, buf, align(size, page_size), + pd); + if (type == MLX5_ALLOC_TYPE_CUSTOM) { ret = mlx5_alloc_buf_custom(mctx, buf, size, pd); if (ret <= 0) @@ -363,6 +410,10 @@ int mlx5_free_actual_buf(struct mlx5_context *ctx, struct mlx5_buf *buf) mlx5_free_buf_custom(ctx, buf); break; + case MLX5_ALLOC_TYPE_DMABUF: + mlx5_free_buf_dmabuf(ctx, buf); + break; + default: mlx5_err(ctx->dbg_fp, "Bad allocation type\n"); } @@ -402,6 +453,13 @@ bool mlx5_is_custom_alloc(struct ibv_pd *pd) return (mparent_domain && mparent_domain->alloc && mparent_domain->free); } +bool mlx5_is_dmabuf_alloc(struct ibv_pd *pd) +{ + struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); + + return (mparent_domain && mparent_domain->dmabuf_heap); +} + bool mlx5_is_extern_alloc(struct mlx5_context *context) { return context->extern_alloc.alloc && context->extern_alloc.free; @@ -417,6 +475,11 @@ void mlx5_get_alloc_type(struct mlx5_context *context, char *env_value; char name[128]; + if (mlx5_is_dmabuf_alloc(pd)) { + *alloc_type = MLX5_ALLOC_TYPE_DMABUF; + return; + } + if (mlx5_is_custom_alloc(pd)) { *alloc_type = MLX5_ALLOC_TYPE_CUSTOM; return; diff --git a/providers/mlx5/cq.c b/providers/mlx5/cq.c index 4d2ad7f8f..29916a276 100644 --- a/providers/mlx5/cq.c +++ b/providers/mlx5/cq.c @@ -1946,7 +1946,8 @@ int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq, 
mlx5_get_alloc_type(mctx, cq->parent_domain, MLX5_CQ_PREFIX, &type, default_type); - if (type == MLX5_ALLOC_TYPE_CUSTOM) { + if (type == MLX5_ALLOC_TYPE_CUSTOM || + type == MLX5_ALLOC_TYPE_DMABUF) { buf->req_alignment = dev->page_size; buf->resource_type = MLX5DV_RES_TYPE_CQ; } diff --git a/providers/mlx5/dbrec.c b/providers/mlx5/dbrec.c index c6ef9415b..091519511 100644 --- a/providers/mlx5/dbrec.c +++ b/providers/mlx5/dbrec.c @@ -64,10 +64,13 @@ static struct mlx5_db_page *__add_page(struct mlx5_context *context, if (!page) return NULL; - if (mlx5_is_extern_alloc(context)) + if (mlx5_is_dmabuf_alloc(pd)) { + ret = mlx5_alloc_buf_dmabuf(context, &page->buf, ps, pd); + } else if (mlx5_is_extern_alloc(context)) { ret = mlx5_alloc_buf_extern(context, &page->buf, ps, pd); - else + } else { ret = mlx5_alloc_buf(&page->buf, ps, ps, pd); + } if (ret) { free(page); return NULL; @@ -92,7 +95,7 @@ __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, __be32 *db = NULL; int i, j; - if (mlx5_is_custom_alloc(pd)) { + if (!mlx5_is_dmabuf_alloc(pd) && mlx5_is_custom_alloc(pd)) { struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); db = mparent_domain->alloc(&mparent_domain->mpd.ibv_pd, @@ -112,10 +115,20 @@ __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, default_alloc: pthread_mutex_lock(&context->dbr_map_mutex); - page = list_top(&context->dbr_available_pages, struct mlx5_db_page, - available); - if (page) - goto found; + /* + * DMA-buf and non-DMA-buf pages must not be mixed, so reuse only an + * available page whose backing type matches the requested allocation. + * In the common, homogeneous case (e.g. no DMA-buf in use, where both + * sides are false) the first page already matches, so we still take + * the head and keep the original list_top() O(1) fast path. The scan + * only walks further - becoming O(n) - when DMA-buf and non-DMA-buf + * pages coexist on the list and the matching type is not at the head. 
+ */ + list_for_each(&context->dbr_available_pages, page, available) { + if (mlx5_is_dmabuf_alloc(pd) == + (page->buf.type == MLX5_ALLOC_TYPE_DMABUF)) + goto found; + } page = __add_page(context, pd); if (!page) @@ -174,10 +187,7 @@ void mlx5_free_db(struct mlx5_context *context, __be32 *db, struct ibv_pd *pd, cl_qmap_remove_item(&context->dbr_map, item); list_del(&page->available); - if (page->buf.type == MLX5_ALLOC_TYPE_EXTERNAL) - mlx5_free_buf_extern(context, &page->buf); - else - mlx5_free_buf(&page->buf); + mlx5_free_actual_buf(context, &page->buf); free(page); } diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index c0c66df40..585133d36 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -203,6 +204,7 @@ enum mlx5_alloc_type { MLX5_ALLOC_TYPE_PREFER_CONTIG, MLX5_ALLOC_TYPE_EXTERNAL, MLX5_ALLOC_TYPE_CUSTOM, + MLX5_ALLOC_TYPE_DMABUF, MLX5_ALLOC_TYPE_ALL }; @@ -266,6 +268,7 @@ enum mlx5_ctx_flags { MLX5_CTX_FLAGS_SQD2RTS_SUPPORTED = 1 << 3, MLX5_CTX_FLAGS_REAL_TIME_TS_SUPPORTED = 1 << 4, MLX5_CTX_FLAGS_MKEY_UPDATE_TAG_SUPPORTED = 1 << 5, + MLX5_CTX_FLAGS_CC_DMA_BOUNCE = 1 << 6, }; struct mlx5_entropy_caps { @@ -471,6 +474,7 @@ struct mlx5_parent_domain { void (*free)(struct ibv_pd *pd, void *pd_context, void *ptr, uint64_t resource_type); void *pd_context; + struct ibv_dmabuf_heap *dmabuf_heap; }; enum { @@ -1131,10 +1135,14 @@ void mlx5_get_alloc_type(struct mlx5_context *context, enum mlx5_alloc_type default_alloc_type); int mlx5_use_huge(const char *key); bool mlx5_is_custom_alloc(struct ibv_pd *pd); +bool mlx5_is_dmabuf_alloc(struct ibv_pd *pd); bool mlx5_is_extern_alloc(struct mlx5_context *context); int mlx5_alloc_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf, size_t size, struct ibv_pd *pd); void mlx5_free_buf_extern(struct mlx5_context *ctx, struct mlx5_buf *buf); +int mlx5_alloc_buf_dmabuf(struct mlx5_context *ctx, struct mlx5_buf *buf, + size_t 
size, struct ibv_pd *pd); +void mlx5_free_buf_dmabuf(struct mlx5_context *ctx, struct mlx5_buf *buf); __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, bool *custom_alloc); diff --git a/providers/mlx5/srq.c b/providers/mlx5/srq.c index 95ad20e74..efd2bc7e8 100644 --- a/providers/mlx5/srq.c +++ b/providers/mlx5/srq.c @@ -389,6 +389,11 @@ int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq, mlx5_get_alloc_type(ctx, pd, MLX5_SRQ_PREFIX, &alloc_type, MLX5_ALLOC_TYPE_ANON); + if (alloc_type == MLX5_ALLOC_TYPE_DMABUF) { + errno = EOPNOTSUPP; + return -1; + } + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { srq->buf.req_alignment = to_mdev(context->device)->page_size; srq->buf.resource_type = MLX5DV_RES_TYPE_SRQ; diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 2439ccd00..3aa858498 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -557,7 +557,8 @@ mlx5_alloc_parent_domain(struct ibv_context *context, if (!check_comp_mask(attr->comp_mask, IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS | - IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT)) { + IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT | + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC)) { errno = EINVAL; return NULL; } @@ -568,6 +569,18 @@ mlx5_alloc_parent_domain(struct ibv_context *context, return NULL; } + if (attr->comp_mask & IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC) { + struct mlx5_context *mctx = to_mctx(context); + + if (mctx->flags & MLX5_CTX_FLAGS_CC_DMA_BOUNCE) { + mparent_domain->dmabuf_heap = ibv_dmabuf_heap_cc_shared_init(); + if (!mparent_domain->dmabuf_heap) { + free(mparent_domain); + return NULL; + } + } + } + if (attr->td) { mparent_domain->mtd = to_mtd(attr->td); atomic_fetch_add(&mparent_domain->mtd->refcount, 1); @@ -602,6 +615,9 @@ static int mlx5_dealloc_parent_domain(struct mlx5_parent_domain *mparent_domain) if (mparent_domain->mtd) atomic_fetch_sub(&mparent_domain->mtd->refcount, 1); + if (mparent_domain->dmabuf_heap) + 
ibv_dmabuf_heap_destroy(mparent_domain->dmabuf_heap); + free(mparent_domain); return 0; } @@ -1303,6 +1319,13 @@ int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) goto out; } + if (mlx5_is_dmabuf_alloc(cq->parent_domain)) { + cq->resize_buf = NULL; + errno = EOPNOTSUPP; + err = EOPNOTSUPP; + goto out; + } + /* currently we don't change cqe size */ cq->resize_cqe_sz = cq->cqe_sz; cq->resize_cqes = cqe; @@ -2020,7 +2043,8 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, mlx5_get_alloc_type(to_mctx(context), attr->pd, MLX5_QP_PREFIX, &alloc_type, default_alloc_type); - if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM || + alloc_type == MLX5_ALLOC_TYPE_DMABUF) { if (attr->qp_type != IBV_QPT_RAW_PACKET && !(qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY)) req_align = mlx5_set_custom_qp_alignment(context, qp); @@ -2047,7 +2071,8 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, size_t aligned_sq_buf_size = align(qp->sq_buf_size, to_mdev(context->device)->page_size); - if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { + if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM || + alloc_type == MLX5_ALLOC_TYPE_DMABUF) { qp->sq_buf.req_alignment = to_mdev(context->device)->page_size; qp->sq_buf.resource_type = MLX5DV_RES_TYPE_QP; } @@ -4293,6 +4318,8 @@ void mlx5_query_device_ctx(struct mlx5_context *mctx) mctx->cached_device_cap_flags = device_attr.orig_attr.device_cap_flags; mctx->atomic_cap = device_attr.orig_attr.atomic_cap; mctx->max_dm_size = device_attr.max_dm_size; + if (device_attr.device_cap_flags_ex & IBV_DEVICE_CC_DMA_BOUNCE) + mctx->flags |= MLX5_CTX_FLAGS_CC_DMA_BOUNCE; mctx->cached_tso_caps = resp.tso_caps; if (resp.mlx5_ib_support_multi_pkt_send_wqes & MLX5_IB_ALLOW_MPW) @@ -4373,6 +4400,11 @@ static int mlx5_alloc_rwq_buf(struct ibv_context *context, mlx5_get_alloc_type(to_mctx(context), pd, MLX5_RWQ_PREFIX, &alloc_type, MLX5_ALLOC_TYPE_ANON); + if (alloc_type == MLX5_ALLOC_TYPE_DMABUF) { + errno = EOPNOTSUPP; + return -1; + } 
+ rwq->rq.wrid = malloc(rwq->rq.wqe_cnt * sizeof(uint64_t)); if (!rwq->rq.wrid) { errno = ENOMEM; From 28142de3d39b91c58846bc00c4d19ff27eb9447f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Feb 2026 10:25:17 +0100 Subject: [PATCH 17/33] mlx5: Implement ibv_alloc_buf/ibv_free_buf ops Implement mlx5_alloc_buf_op()/mlx5_free_buf_op() using the existing struct mlx5_buf based allocation infrastructure. Signed-off-by: Jiri Pirko --- providers/mlx5/buf.c | 2 +- providers/mlx5/mlx5.c | 2 ++ providers/mlx5/mlx5.h | 2 ++ providers/mlx5/verbs.c | 43 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/providers/mlx5/buf.c b/providers/mlx5/buf.c index 1a6b6cda6..9fc7d44a0 100644 --- a/providers/mlx5/buf.c +++ b/providers/mlx5/buf.c @@ -613,9 +613,9 @@ void mlx5_free_buf_contig(struct mlx5_context *mctx, struct mlx5_buf *buf) int mlx5_alloc_buf(struct mlx5_buf *buf, size_t size, int page_size, struct ibv_pd *pd) { + size_t al_size; void *addr; int ret; - int al_size; al_size = align(size, page_size); ret = posix_memalign(&addr, page_size, al_size); diff --git a/providers/mlx5/mlx5.c b/providers/mlx5/mlx5.c index 8cc9ad6f4..22daff36b 100644 --- a/providers/mlx5/mlx5.c +++ b/providers/mlx5/mlx5.c @@ -133,6 +133,8 @@ static const struct verbs_context_ops mlx5_ctx_common_ops = { .alloc_dm = mlx5_alloc_dm, .alloc_parent_domain = mlx5_alloc_parent_domain, .alloc_td = mlx5_alloc_td, + .alloc_buf = mlx5_alloc_buf_op, + .free_buf = mlx5_free_buf_op, .attach_counters_point_flow = mlx5_attach_counters_point_flow, .close_xrcd = mlx5_close_xrcd, .create_counters = mlx5_create_counters, diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index 585133d36..0803b8fc7 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -1308,6 +1308,8 @@ int mlx5_dealloc_td(struct ibv_td *td); struct ibv_pd *mlx5_alloc_parent_domain(struct ibv_context *context, struct ibv_parent_domain_init_attr *attr); +void 
*mlx5_alloc_buf_op(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); +void mlx5_free_buf_op(struct ibv_buf *buf); struct ibv_dmah *mlx5_alloc_dmah(struct ibv_context *context, struct ibv_dmah_init_attr *attr); diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 3aa858498..6b14672e1 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -622,6 +622,49 @@ static int mlx5_dealloc_parent_domain(struct mlx5_parent_domain *mparent_domain) return 0; } +void *mlx5_alloc_buf_op(struct ibv_pd *pd, size_t size, struct ibv_buf **ibv_buf) +{ + struct mlx5_context *mctx = to_mctx(pd->context); + enum mlx5_alloc_type alloc_type; + struct mlx5_buf *buf; + + if (size == 0) { + errno = EINVAL; + return NULL; + } + + buf = calloc(1, sizeof(*buf)); + if (!buf) { + errno = ENOMEM; + return NULL; + } + + mlx5_get_alloc_type(mctx, pd, MLX5_MR_PREFIX, &alloc_type, + MLX5_ALLOC_TYPE_ANON); + + buf->req_alignment = to_mdev(pd->context->device)->page_size; + + if (mlx5_alloc_prefered_buf(mctx, buf, size, + to_mdev(pd->context->device)->page_size, + alloc_type, MLX5_MR_PREFIX, pd)) { + free(buf); + errno = ENOMEM; + return NULL; + } + + *ibv_buf = &buf->ibv_buf; + return buf->ibv_buf.addr; +} + +void mlx5_free_buf_op(struct ibv_buf *ibv_buf) +{ + struct mlx5_buf *buf; + + buf = container_of(ibv_buf, struct mlx5_buf, ibv_buf); + mlx5_free_actual_buf(to_mctx(ibv_buf->pd->context), buf); + free(buf); +} + static int _mlx5_free_pd(struct ibv_pd *pd, bool unimport) { int ret; From 9b1812c023406e0aecd068fc14ce10a1a94379a9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 4 May 2026 15:04:49 +0200 Subject: [PATCH 18/33] mlx5: Pass per-buffer dmabuf UMEM attrs to kernel Wire CQ and QP create paths to the new per-attribute UMEM UAPI: emit a struct ib_uverbs_buffer_desc for each dmabuf-backed buffer (CQ ring, QP main/SQ, doorbell record) on the driver_attrs chain via fill_attr_in_buf_umem(). 
Signed-off-by: Jiri Pirko --- providers/mlx5/dbrec.c | 7 ++++- providers/mlx5/mlx5.h | 4 ++- providers/mlx5/verbs.c | 64 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 62 insertions(+), 13 deletions(-) diff --git a/providers/mlx5/dbrec.c b/providers/mlx5/dbrec.c index 091519511..c5d1c3f68 100644 --- a/providers/mlx5/dbrec.c +++ b/providers/mlx5/dbrec.c @@ -89,12 +89,15 @@ static struct mlx5_db_page *__add_page(struct mlx5_context *context, } __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, - bool *custom_alloc) + bool *custom_alloc, struct ibv_buf **dbrec_buf) { struct mlx5_db_page *page; __be32 *db = NULL; int i, j; + if (dbrec_buf) + *dbrec_buf = NULL; + if (!mlx5_is_dmabuf_alloc(pd) && mlx5_is_custom_alloc(pd)) { struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); @@ -146,6 +149,8 @@ __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, --j; page->free[i] &= ~(1UL << j); db = page->buf.ibv_buf.addr + (i * 8 * sizeof(long) + j) * context->cache_line_size; + if (dbrec_buf) + *dbrec_buf = &page->buf.ibv_buf; out: pthread_mutex_unlock(&context->dbr_map_mutex); diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index 0803b8fc7..b24ef7989 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -505,6 +505,7 @@ struct mlx5_cq { uint32_t cqn; uint32_t cons_index; __be32 *dbrec; + struct ibv_buf *dbrec_ibv_buf; bool custom_db; int arm_sn; int cqe_sz; @@ -695,6 +696,7 @@ struct mlx5_qp { struct mlx5_wq sq; __be32 *db; + struct ibv_buf *dbrec_ibv_buf; bool custom_db; struct mlx5_wq rq; int wq_sig; @@ -1145,7 +1147,7 @@ int mlx5_alloc_buf_dmabuf(struct mlx5_context *ctx, struct mlx5_buf *buf, void mlx5_free_buf_dmabuf(struct mlx5_context *ctx, struct mlx5_buf *buf); __be32 *mlx5_alloc_dbrec(struct mlx5_context *context, struct ibv_pd *pd, - bool *custom_alloc); + bool *custom_alloc, struct ibv_buf **dbrec_buf); void mlx5_free_db(struct mlx5_context *context, __be32 *db, struct ibv_pd 
*pd, bool custom_alloc); diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 6b14672e1..6f321f66c 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -1061,8 +1061,10 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, struct mlx5dv_cq_init_attr *mlx5cq_attr) { DECLARE_COMMAND_BUFFER_LINK(driver_attrs, UVERBS_OBJECT_CQ, - UVERBS_METHOD_CQ_CREATE, 1, + UVERBS_METHOD_CQ_CREATE, 3, NULL); + struct ib_uverbs_buffer_desc cq_buf_umem_desc; + struct ib_uverbs_buffer_desc cq_dbr_umem_desc; struct mlx5_create_cq_ex cmd_ex = {}; struct mlx5_create_cq_ex_resp resp_ex = {}; struct mlx5_ib_create_cq *cmd_drv; @@ -1165,7 +1167,7 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, } cq->dbrec = mlx5_alloc_dbrec(to_mctx(context), cq->parent_domain, - &cq->custom_db); + &cq->custom_db, &cq->dbrec_ibv_buf); if (!cq->dbrec) { mlx5_dbg(fp, MLX5_DBG_CQ, "\n"); goto err_buf; @@ -1234,6 +1236,12 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, } } + fill_attr_in_buf_umem(driver_attrs, UVERBS_ATTR_CREATE_CQ_BUF_UMEM, + &cq_buf_umem_desc, &cq->buf_a.ibv_buf, NULL, 0); + fill_attr_in_buf_umem(driver_attrs, MLX5_IB_ATTR_CREATE_CQ_DBR_BUF_UMEM, + &cq_dbr_umem_desc, cq->dbrec_ibv_buf, cq->dbrec, + sizeof(*cq->dbrec) * 2); + { struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr; @@ -1476,7 +1484,8 @@ struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd, goto err; } - srq->db = mlx5_alloc_dbrec(to_mctx(pd->context), pd, &srq->custom_db); + srq->db = mlx5_alloc_dbrec(to_mctx(pd->context), pd, &srq->custom_db, + NULL); if (!srq->db) { mlx5_err(ctx->dbg_fp, "%s-%d:\n", __func__, __LINE__); goto err_free; @@ -2254,7 +2263,8 @@ static int mlx5_cmd_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_create_qp *cmd, struct mlx5_qp *qp, - struct mlx5_create_qp_ex_resp *resp) + struct mlx5_create_qp_ex_resp *resp, + struct ibv_command_buffer *driver_attrs) { struct mlx5_create_qp_ex cmd_ex; int 
ret; @@ -2267,7 +2277,7 @@ static int mlx5_cmd_create_qp_ex(struct ibv_context *context, ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp->ibv_resp, - sizeof(*resp), NULL); + sizeof(*resp), driver_attrs); return ret; } @@ -2482,6 +2492,11 @@ static struct ibv_qp *create_qp(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5dv_qp_init_attr *mlx5_qp_attr) { + DECLARE_COMMAND_BUFFER_LINK(driver_attrs, UVERBS_OBJECT_QP, + UVERBS_METHOD_QP_CREATE, 3, NULL); + struct ib_uverbs_buffer_desc qp_buf_umem_desc; + struct ib_uverbs_buffer_desc qp_sq_buf_umem_desc; + struct ib_uverbs_buffer_desc qp_dbr_umem_desc; struct mlx5_create_qp cmd; struct mlx5_create_qp_resp resp; struct mlx5_create_qp_ex_resp resp_ex; @@ -2495,6 +2510,8 @@ static struct ibv_qp *create_qp(struct ibv_context *context, FILE *fp = ctx->dbg_fp; struct mlx5_parent_domain *mparent_domain; struct mlx5_ib_create_qp_resp *resp_drv; + bool need_buf_umem; + uint16_t qp_main_attr_id; if (attr->comp_mask & ~MLX5_CREATE_QP_SUP_COMP_MASK) return NULL; @@ -2746,7 +2763,8 @@ static struct ibv_qp *create_qp(struct ibv_context *context, mlx5_spinlock_init_pd(&qp->rq.lock, attr->pd)) goto err_free_qp_buf; - qp->db = mlx5_alloc_dbrec(ctx, attr->pd, &qp->custom_db); + qp->db = mlx5_alloc_dbrec(ctx, attr->pd, &qp->custom_db, + &qp->dbrec_ibv_buf); if (!qp->db) { mlx5_dbg(fp, MLX5_DBG_QP, "\n"); goto err_free_qp_buf; @@ -2803,8 +2821,31 @@ static struct ibv_qp *create_qp(struct ibv_context *context, /* Create QP should start from ECE version 1 as a trigger */ cmd.ece_options = 0x10000000; - if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) - ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex); + qp_main_attr_id = (attr->qp_type == IBV_QPT_RAW_PACKET || + qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY) ? 
+ UVERBS_ATTR_CREATE_QP_RQ_BUF_UMEM : + UVERBS_ATTR_CREATE_QP_BUF_UMEM; + + fill_attr_in_buf_umem(driver_attrs, qp_main_attr_id, + &qp_buf_umem_desc, &qp->buf.ibv_buf, NULL, 0); + if (qp->sq_buf.ibv_buf.addr) + fill_attr_in_buf_umem(driver_attrs, + UVERBS_ATTR_CREATE_QP_SQ_BUF_UMEM, + &qp_sq_buf_umem_desc, + &qp->sq_buf.ibv_buf, NULL, 0); + fill_attr_in_buf_umem(driver_attrs, + MLX5_IB_ATTR_CREATE_QP_DBR_BUF_UMEM, + &qp_dbr_umem_desc, qp->dbrec_ibv_buf, qp->db, + sizeof(*qp->db) * 2); + + need_buf_umem = (qp->buf.type == MLX5_ALLOC_TYPE_DMABUF) || + (qp->sq_buf.type == MLX5_ALLOC_TYPE_DMABUF) || + (qp->dbrec_ibv_buf && + qp->dbrec_ibv_buf->dmabuf_fd != -1); + + if ((attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) || need_buf_umem) + ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex, + driver_attrs); else ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, attr, &cmd.ibv_cmd, sizeof(cmd), @@ -2814,7 +2855,8 @@ static struct ibv_qp *create_qp(struct ibv_context *context, goto err_free_uidx; } - resp_drv = attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK ? + resp_drv = ((attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) || + need_buf_umem) ? 
&resp_ex.drv_payload : &resp.drv_payload; if (!ctx->cqe_version) { if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) { @@ -3883,7 +3925,7 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, goto err; } - msrq->db = mlx5_alloc_dbrec(ctx, attr->pd, &msrq->custom_db); + msrq->db = mlx5_alloc_dbrec(ctx, attr->pd, &msrq->custom_db, NULL); if (!msrq->db) { mlx5_err(ctx->dbg_fp, "%s-%d:\n", __func__, __LINE__); goto err_free; @@ -4517,7 +4559,7 @@ static struct ibv_wq *create_wq(struct ibv_context *context, if (mlx5_spinlock_init_pd(&rwq->rq.lock, attr->pd)) goto err_free_rwq_buf; - rwq->db = mlx5_alloc_dbrec(ctx, attr->pd, &rwq->custom_db); + rwq->db = mlx5_alloc_dbrec(ctx, attr->pd, &rwq->custom_db, NULL); if (!rwq->db) goto err_free_rwq_buf; From de196b3549996b102ef3688bf1dfc627e449a06e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 16 Feb 2026 14:51:19 +0100 Subject: [PATCH 19/33] rc_pingpong: Move buffer allocation after PD setup In preparation for the follow-up patch, move ctx->buf allocation later in pp_init_ctx(). 
Signed-off-by: Jiri Pirko --- libibverbs/examples/rc_pingpong.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/libibverbs/examples/rc_pingpong.c b/libibverbs/examples/rc_pingpong.c index 9781c4f34..c6b052680 100644 --- a/libibverbs/examples/rc_pingpong.c +++ b/libibverbs/examples/rc_pingpong.c @@ -344,20 +344,11 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ctx->send_flags = IBV_SEND_SIGNALED; ctx->rx_depth = rx_depth; - ctx->buf = memalign(page_size, size); - if (!ctx->buf) { - fprintf(stderr, "Couldn't allocate work buf.\n"); - goto clean_ctx; - } - - /* FIXME memset(ctx->buf, 0, size); */ - memset(ctx->buf, 0x7b, size); - ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); - goto clean_buffer; + goto clean_ctx; } if (use_event) { @@ -431,6 +422,15 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, } } + ctx->buf = memalign(page_size, size); + if (!ctx->buf) { + fprintf(stderr, "Couldn't allocate work buf.\n"); + goto clean_dm; + } + + /* FIXME memset(ctx->buf, 0, size); */ + memset(ctx->buf, 0x7b, size); + if (implicit_odp) { ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, access_flags); } else { @@ -441,7 +441,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); - goto clean_dm; + goto clean_buffer; } if (prefetch_mr) { @@ -557,6 +557,9 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, clean_mr: ibv_dereg_mr(ctx->mr); +clean_buffer: + free(ctx->buf); + clean_dm: if (ctx->dm) ibv_free_dm(ctx->dm); @@ -571,9 +574,6 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, clean_device: ibv_close_device(ctx->context); -clean_buffer: - free(ctx->buf); - clean_ctx: free(ctx); @@ -597,6 +597,8 @@ static int 
pp_close_ctx(struct pingpong_context *ctx) return 1; } + free(ctx->buf); + if (ctx->dm) { if (ibv_free_dm(ctx->dm)) { fprintf(stderr, "Couldn't free DM\n"); @@ -621,7 +623,6 @@ static int pp_close_ctx(struct pingpong_context *ctx) return 1; } - free(ctx->buf); free(ctx); return 0; From 76d0bee61571e4732d5ad59827e3d020b7599127 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 16 Feb 2026 19:12:12 +0100 Subject: [PATCH 20/33] rc_pingpong: Add unprotected memory allocation for CoCo guests Add -U/--allow-cc-unprotected option for running in CoCo guests if device DMA requires unprotected/shared memory. When set, create a parent domain with ALLOW_CC_UNPROTECTED_ALLOC and use ibv_alloc_buf()/ibv_free_buf()/ibv_reg_buf_mr() for MR buffer allocation and registration. Signed-off-by: Jiri Pirko --- libibverbs/examples/rc_pingpong.c | 89 +++++++++++++++++++++++++------ libibverbs/man/ibv_rc_pingpong.1 | 7 ++- 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/libibverbs/examples/rc_pingpong.c b/libibverbs/examples/rc_pingpong.c index c6b052680..c31bdb698 100644 --- a/libibverbs/examples/rc_pingpong.c +++ b/libibverbs/examples/rc_pingpong.c @@ -62,12 +62,13 @@ static int prefetch_mr; static int use_ts; static int validate_buf; static int use_dm; +static int allow_cc_unprotected; static int use_new_send; struct pingpong_context { struct ibv_context *context; struct ibv_comp_channel *channel; - struct ibv_pd *pd; + struct ibv_pd *pd; /* PD or parent domain (if using CC unprotected alloc) */ struct ibv_mr *mr; struct ibv_dm *dm; union { @@ -83,11 +84,13 @@ struct pingpong_context { int pending; struct ibv_port_attr portinfo; uint64_t completion_timestamp_mask; + struct ibv_buf *ibv_buf; + struct ibv_pd *base_pd; /* Real PD backing the parent domain; NULL unless CC unprotected alloc is used */ }; static struct ibv_cq *pp_cq(struct pingpong_context *ctx) { - return use_ts ? ibv_cq_ex_to_cq(ctx->cq_s.cq_ex) : + return (use_ts || allow_cc_unprotected) ? 
ibv_cq_ex_to_cq(ctx->cq_s.cq_ex) : ctx->cq_s.cq; } @@ -360,10 +363,30 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, } else ctx->channel = NULL; - ctx->pd = ibv_alloc_pd(ctx->context); - if (!ctx->pd) { - fprintf(stderr, "Couldn't allocate PD\n"); - goto clean_comp_channel; + if (allow_cc_unprotected) { + struct ibv_parent_domain_init_attr parent_attr = {}; + + ctx->base_pd = ibv_alloc_pd(ctx->context); + if (!ctx->base_pd) { + fprintf(stderr, "Couldn't allocate base PD\n"); + goto clean_comp_channel; + } + + parent_attr.pd = ctx->base_pd; + parent_attr.comp_mask = + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC; + + ctx->pd = ibv_alloc_parent_domain(ctx->context, &parent_attr); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate parent domain\n"); + goto clean_base_pd; + } + } else { + ctx->pd = ibv_alloc_pd(ctx->context); + if (!ctx->pd) { + fprintf(stderr, "Couldn't allocate PD\n"); + goto clean_comp_channel; + } } if (use_odp || use_ts || use_dm) { @@ -422,7 +445,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, } } - ctx->buf = memalign(page_size, size); + ctx->buf = ibv_alloc_buf(ctx->pd, size, &ctx->ibv_buf); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_dm; @@ -433,10 +456,12 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, if (implicit_odp) { ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, access_flags); + } else if (use_dm) { + ctx->mr = ibv_reg_dm_mr(ctx->pd, ctx->dm, 0, + size, access_flags); } else { - ctx->mr = use_dm ? ibv_reg_dm_mr(ctx->pd, ctx->dm, 0, - size, access_flags) : - ibv_reg_mr(ctx->pd, ctx->buf, size, access_flags); + ctx->mr = ibv_reg_buf_mr(ctx->pd, ctx->ibv_buf, ctx->buf, size, + access_flags); } if (!ctx->mr) { @@ -460,15 +485,22 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, fprintf(stderr, "Couldn't prefetch MR(%d). 
Continue anyway\n", ret); } - if (use_ts) { + if (use_ts || allow_cc_unprotected) { struct ibv_cq_init_attr_ex attr_ex = { .cqe = rx_depth + 1, .cq_context = NULL, .channel = ctx->channel, .comp_vector = 0, - .wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP }; + if (use_ts) + attr_ex.wc_flags = IBV_WC_EX_WITH_COMPLETION_TIMESTAMP; + + if (allow_cc_unprotected) { + attr_ex.comp_mask |= IBV_CQ_INIT_ATTR_MASK_PD; + attr_ex.parent_domain = ctx->pd; + } + ctx->cq_s.cq_ex = ibv_create_cq_ex(ctx->context, &attr_ex); } else { ctx->cq_s.cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, @@ -558,7 +590,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, ibv_dereg_mr(ctx->mr); clean_buffer: - free(ctx->buf); + ibv_free_buf(ctx->ibv_buf); clean_dm: if (ctx->dm) @@ -566,6 +598,9 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, clean_pd: ibv_dealloc_pd(ctx->pd); +clean_base_pd: + if (ctx->base_pd) + ibv_dealloc_pd(ctx->base_pd); clean_comp_channel: if (ctx->channel) @@ -597,7 +632,7 @@ static int pp_close_ctx(struct pingpong_context *ctx) return 1; } - free(ctx->buf); + ibv_free_buf(ctx->ibv_buf); if (ctx->dm) { if (ibv_free_dm(ctx->dm)) { @@ -611,6 +646,13 @@ static int pp_close_ctx(struct pingpong_context *ctx) return 1; } + if (ctx->base_pd) { + if (ibv_dealloc_pd(ctx->base_pd)) { + fprintf(stderr, "Couldn't deallocate base PD\n"); + return 1; + } + } + if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { fprintf(stderr, "Couldn't destroy completion channel\n"); @@ -788,6 +830,8 @@ static void usage(const char *argv0) printf(" -c, --chk validate received buffer\n"); printf(" -j, --dm use device memory\n"); printf(" -N, --new_send use new post send WR API\n"); + printf(" -U, --allow-cc-unprotected allow allocation of unprotected/shared\n" + " memory on CoCo guests\n"); } int main(int argc, char *argv[]) @@ -838,10 +882,11 @@ int main(int argc, char *argv[]) { .name = "chk", .has_arg = 0, .val = 
'c' }, { .name = "dm", .has_arg = 0, .val = 'j' }, { .name = "new_send", .has_arg = 0, .val = 'N' }, + { .name = "allow-cc-unprotected", .has_arg = 0, .val = 'U' }, {} }; - c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:oOPtcjN", + c = getopt_long(argc, argv, "p:d:i:s:m:r:n:l:eg:oOPtcjNU", long_options, NULL); if (c == -1) @@ -925,6 +970,10 @@ int main(int argc, char *argv[]) use_new_send = 1; break; + case 'U': + allow_cc_unprotected = 1; + break; + default: usage(argv[0]); return 1; @@ -943,6 +992,16 @@ int main(int argc, char *argv[]) return 1; } + if (allow_cc_unprotected && use_odp) { + fprintf(stderr, "CoCo unprotected memory cannot be used with ODP\n"); + return 1; + } + + if (allow_cc_unprotected && use_dm) { + fprintf(stderr, "CoCo unprotected memory cannot be used with device memory\n"); + return 1; + } + if (!use_odp && prefetch_mr) { fprintf(stderr, "prefetch is valid only with on-demand memory region\n"); return 1; diff --git a/libibverbs/man/ibv_rc_pingpong.1 b/libibverbs/man/ibv_rc_pingpong.1 index 92554c021..318c85fd5 100644 --- a/libibverbs/man/ibv_rc_pingpong.1 +++ b/libibverbs/man/ibv_rc_pingpong.1 @@ -8,12 +8,12 @@ ibv_rc_pingpong \- simple InfiniBand RC transport test .B ibv_rc_pingpong [\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] [\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] -[\-o] [\-P] [\-t] [\-j] [\-N] \fBHOSTNAME\fR +[\-o] [\-P] [\-t] [\-c] [\-j] [\-N] [\-U] \fBHOSTNAME\fR .B ibv_rc_pingpong [\-p port] [\-d device] [\-i ib port] [\-s size] [\-m size] [\-r rx depth] [\-n iters] [\-l sl] [\-e] [\-g gid index] -[\-o] [\-P] [\-t] [\-j] [\-N] +[\-o] [\-P] [\-t] [\-c] [\-j] [\-N] [\-U] .SH DESCRIPTION .PP @@ -72,6 +72,9 @@ use device memory .TP \fB\-N\fR, \fB\-\-new_send\fR use new post send WR API +.TP +\fB\-U\fR, \fB\-\-allow\-cc\-unprotected\fR +allow allocation of unprotected/shared memory on CoCo guests .SH SEE ALSO .BR ibv_uc_pingpong (1), From 9fe1f2197bc93acbe2fe51df9341bbea4a10b9b7 Mon Sep 17 00:00:00 2001 
From: Jiri Pirko Date: Tue, 9 Jun 2026 12:44:54 +0200 Subject: [PATCH 21/33] pyverbs: Add CoCo DMA bounce device cap flag Expose IBV_DEVICE_CC_DMA_BOUNCE to pyverbs so the extended device capability can be queried from Python. Signed-off-by: Jiri Pirko --- pyverbs/libibverbs_enums.pxd | 1 + pyverbs/libibverbs_enums.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index e7caf854b..c389aad5b 100644 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -481,6 +481,7 @@ cdef extern from '': cdef unsigned long long IBV_DEVICE_RAW_SCATTER_FCS cdef unsigned long long IBV_DEVICE_PCI_WRITE_END_PADDING + cdef unsigned long long IBV_DEVICE_CC_DMA_BOUNCE cpdef enum ibv_parent_domain_init_attr_mask: IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS diff --git a/pyverbs/libibverbs_enums.pyx b/pyverbs/libibverbs_enums.pyx index d1c452479..b40709f1e 100644 --- a/pyverbs/libibverbs_enums.pyx +++ b/pyverbs/libibverbs_enums.pyx @@ -1,6 +1,7 @@ from pyverbs.libibverbs_enums cimport ( IBV_DEVICE_RAW_SCATTER_FCS, IBV_DEVICE_PCI_WRITE_END_PADDING, + IBV_DEVICE_CC_DMA_BOUNCE, IBV_ADVISE_MR_ADVICE_PREFETCH, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE, IBV_ADVISE_MR_FLAG_FLUSH, @@ -12,6 +13,7 @@ from libc.stddef cimport size_t _IBV_DEVICE_RAW_SCATTER_FCS = IBV_DEVICE_RAW_SCATTER_FCS _IBV_DEVICE_PCI_WRITE_END_PADDING = IBV_DEVICE_PCI_WRITE_END_PADDING +_IBV_DEVICE_CC_DMA_BOUNCE = IBV_DEVICE_CC_DMA_BOUNCE _IBV_ALLOCATOR_USE_DEFAULT = IBV_ALLOCATOR_USE_DEFAULT _IBV_ADVISE_MR_ADVICE_PREFETCH = IBV_ADVISE_MR_ADVICE_PREFETCH _IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE From 7f3c8bd8c353bc749ae89b4c4bb98ad036494187 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 9 Jun 2026 12:44:54 +0200 Subject: [PATCH 22/33] pyverbs: Add parent domain CC unprotected flag Expose IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC to pyverbs so it can be passed in the parent domain init comp_mask. 
Signed-off-by: Jiri Pirko --- pyverbs/libibverbs_enums.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index c389aad5b..fea916b78 100644 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -486,6 +486,7 @@ cdef extern from '': cpdef enum ibv_parent_domain_init_attr_mask: IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT + IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC cdef void *IBV_ALLOCATOR_USE_DEFAULT From c85101b183f035e0a0d339e9c04f82a1996925e2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Jun 2026 19:01:57 +0200 Subject: [PATCH 23/33] pyverbs: Add ParentDomain CC unprotected alloc Add a comp_mask argument to ParentDomainInitAttr so callers can request IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC, the opt-in used by DMA-bounce devices on Confidential Computing (CoCo) guests. Signed-off-by: Jiri Pirko --- pyverbs/pd.pyx | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pyverbs/pd.pyx b/pyverbs/pd.pyx index d8845e8a0..1dc2bb06b 100644 --- a/pyverbs/pd.pyx +++ b/pyverbs/pd.pyx @@ -219,25 +219,33 @@ cdef class ParentDomainContext(PyverbsObject): cdef class ParentDomainInitAttr(PyverbsObject): - def __init__(self, PD pd not None, ParentDomainContext pd_context=None): + def __init__(self, PD pd not None, ParentDomainContext pd_context=None, + comp_mask=0): """ Represents ibv_parent_domain_init_attr C struct :param pd: PD to initialize the ParentDomain with :param pd_context: ParentDomainContext object including the alloc and free Python callbacks + :param comp_mask: Bit-mask of optional fields/flags. + The ALLOCATORS and PD_CONTEXT bits are set + automatically when pd_context is provided. 
""" super().__init__() self.pd = pd self.init_attr.pd = pd.pd + pd_context_bits = v.IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS | \ + v.IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT + if pd_context is None and (comp_mask & pd_context_bits): + raise PyverbsUserError('comp_mask bits ALLOCATORS/PD_CONTEXT require ' + 'pd_context to be provided') + self.init_attr.comp_mask = comp_mask if pd_context: self.init_attr.alloc = pd_alloc self.init_attr.free = pd_free self.init_attr.pd_context = pd_context - # The only way to use Python callbacks is to pass the (Python) - # functions through pd_context. Hence, we must set PD_CONTEXT - # in the comp mask. - self.init_attr.comp_mask = v.IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT | \ - v.IBV_PARENT_DOMAIN_INIT_ATTR_ALLOCATORS + # Python callbacks can only be passed through pd_context, so enable + # both pd_context-backed bits in the comp mask. + self.init_attr.comp_mask |= pd_context_bits @property def comp_mask(self): From c2ff90f8c79f3ac13a6b1914213e715ce2dcf0d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Jun 2026 19:02:42 +0200 Subject: [PATCH 24/33] pyverbs: Add ibv_buf provider-aware buffer support Wrap the provider-aware buffer API (ibv_alloc_buf, ibv_free_buf, ibv_reg_buf_mr) with new Buf and BufMR classes. Buf owns a buffer allocated through a PD and is tracked in a per-PD weakset, so it is torn down before the PD it belongs to. BufMR registers an MR over a (sub)range of a Buf and deregisters it on close without freeing the underlying buffer. 
Signed-off-by: Jiri Pirko --- pyverbs/libibverbs.pxd | 8 +++ pyverbs/libibverbs_enums.pxd | 1 + pyverbs/mr.pxd | 12 ++++ pyverbs/mr.pyx | 118 ++++++++++++++++++++++++++++++++++- pyverbs/pd.pxd | 1 + pyverbs/pd.pyx | 8 ++- 6 files changed, 143 insertions(+), 5 deletions(-) diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd index 96b4b098d..566089a99 100644 --- a/pyverbs/libibverbs.pxd +++ b/pyverbs/libibverbs.pxd @@ -83,6 +83,9 @@ cdef extern from 'infiniband/verbs.h': unsigned int lkey unsigned int rkey + cdef struct ibv_buf: + pass + cdef struct ibv_query_device_ex_input: unsigned int comp_mask @@ -663,6 +666,7 @@ cdef extern from 'infiniband/verbs.h': int fd uint64_t fd_offset ibv_dmah *dmah + ibv_buf *buf ibv_device **ibv_get_device_list(int *n) int ibv_get_device_index(ibv_device *device); @@ -685,6 +689,10 @@ cdef extern from 'infiniband/verbs.h': ibv_mr *ibv_reg_mr(ibv_pd *pd, void *addr, size_t length, int access) ibv_mr *ibv_reg_dmabuf_mr(ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) + void *ibv_alloc_buf(ibv_pd *pd, size_t size, ibv_buf **buf) + void ibv_free_buf(ibv_buf *buf) + ibv_mr *ibv_reg_buf_mr(ibv_pd *pd, ibv_buf *buf, void *addr, + size_t length, int access) int ibv_rereg_mr(ibv_mr *mr, int flags, ibv_pd *pd, void *addr, size_t length, int access) int ibv_dereg_mr(ibv_mr *mr) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index fea916b78..9219a367d 100644 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -522,6 +522,7 @@ cdef extern from '': IBV_REG_MR_MASK_FD IBV_REG_MR_MASK_FD_OFFSET IBV_REG_MR_MASK_DMAH + IBV_REG_MR_MASK_BUF cdef extern from "": diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd index 94ed2c251..acfcea367 100644 --- a/pyverbs/mr.pxd +++ b/pyverbs/mr.pxd @@ -24,6 +24,18 @@ cdef class MR(PyverbsCM): cdef class MREx(MR): cdef object dmah + cdef object backing_buf + +cdef class Buf(PyverbsCM): + cdef v.ibv_buf *bufh + cdef void *addr + cdef size_t 
size + cdef object pd + cdef object mrs + cdef add_ref(self, obj) + +cdef class BufMR(MR): + cdef object backing_buf cdef class MWBindInfo(PyverbsCM): cdef v.ibv_mw_bind_info info diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx index 706653b7c..727c7abb4 100644 --- a/pyverbs/mr.pyx +++ b/pyverbs/mr.pyx @@ -623,18 +623,21 @@ cdef class MREx(MR): This class provides more flexibility in memory registration compared to the basic MR class. """ def __init__(self, PD pd not None, length=0, access=0, address=None, - iova=None, fd=None, fd_offset=0, dmah=None, implicit=False, **kwargs): + iova=None, fd=None, fd_offset=0, dmah=None, implicit=False, + Buf buf=None, **kwargs): """ Register a memory region using the extended API ibv_reg_mr_ex. :param pd: A PD object :param length: Length (in bytes) of MR's buffer :param access: Access flags, see ibv_access_flags enum - :param address: Memory address to register (Optional) + :param address: Memory address to register; defaults to the start of + buf when buf is given (Optional) :param iova: IOVA address to register (Optional) :param fd: File descriptor for dma-buf based registration (Optional) :param fd_offset: Offset in the dma-buf (Optional) :param dmah: DMA handle for registration (Optional) :param implicit: If True, register implicit MR + :param buf: A Buf object to register the MR on (Optional). 
:param kwargs: Additional arguments :return: The newly created MREx on success """ @@ -645,8 +648,14 @@ cdef class MREx(MR): self.is_user_addr = False self.mmap_length = 0 + if buf is not None: + self.is_user_addr = True + if address is None: + self.buf = buf.addr + else: + self.buf = address # Handle memory allocation if no address is provided - if not address and length > 0 and fd is None: + elif not address and length > 0 and fd is None: self._allocate_buffer(length, self.is_huge, &mmap_len) if self.buf == NULL: raise PyverbsError(f'Failed to allocate MR buffer of size {length}') @@ -667,6 +676,9 @@ cdef class MREx(MR): in_.addr = self.buf in_.access = access + if buf is not None: + in_.comp_mask |= e.IBV_REG_MR_MASK_BUF + in_.buf = buf.bufh if iova is not None: in_.comp_mask |= e.IBV_REG_MR_MASK_IOVA in_.iova = iova @@ -687,6 +699,9 @@ cdef class MREx(MR): self.pd = pd pd.add_ref(self) + if buf is not None: + self.backing_buf = buf + buf.add_ref(self) if dmah is not None: (dmah).add_ref(self) self.dmah = dmah @@ -707,3 +722,100 @@ cdef class MREx(MR): if self.mr != NULL: super(MREx, self).close() self.dmah = None + +cdef class Buf(PyverbsCM): + """ + Represents an ibv_buf buffer allocated through a PD. + The device provider selects the backing memory for the given PD. + """ + def __init__(self, PD pd not None, size): + """ + Allocate a buffer of the given size from the provider associated with + the given protection domain (or parent domain). 
+ :param pd: A PD/ParentDomain object used for the allocation + :param size: Size (in bytes) of the buffer to allocate + :return: The newly created Buf on success + """ + super().__init__() + self.mrs = weakref.WeakSet() + self.addr = v.ibv_alloc_buf(pd.pd, size, &self.bufh) + if self.addr == NULL: + raise PyverbsRDMAErrno(f'Failed to allocate ibv_buf of size {size}') + self.size = size + self.pd = pd + pd.add_ref(self) + self.logger.debug(f'Allocated ibv_buf of size {size}') + + def __dealloc__(self): + self.close() + + cpdef close(self): + """ + Frees the underlying buffer using ibv_free_buf(). + :return: None + """ + if self.bufh != NULL: + if self.logger: + self.logger.debug('Closing Buf') + close_weakrefs([self.mrs]) + v.ibv_free_buf(self.bufh) + self.bufh = NULL + self.addr = NULL + self.pd = None + + cdef add_ref(self, obj): + if isinstance(obj, (BufMR, MREx)): + self.mrs.add(obj) + else: + raise PyverbsError('Unrecognized object type') + + @property + def addr(self): + return self.addr + + @property + def size(self): + return self.size + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'Buf:\n' + \ + print_format.format('addr', self.addr) + \ + print_format.format('size', self.size) + + +cdef class BufMR(MR): + """ + BufMR represents a memory region registered for a Buf via ibv_reg_buf_mr(). + Unlike MR, the backing memory is owned by the Buf, so closing a BufMR only + deregisters the MR and never frees the buffer. The IBV_REG_MR_MASK_BUF path + of ibv_reg_mr_ex() is exercised through the MREx class instead. + """ + def __init__(self, PD pd not None, Buf buf not None, length=0, access=0, + offset=0): + """ + Register a memory region for (a subrange of) the given Buf. 
+ :param pd: The same PD/ParentDomain used to allocate the Buf + :param buf: A Buf object allocated with ibv_alloc_buf() + :param length: Length (in bytes) to register + :param access: Access flags, see ibv_access_flags enum + :param offset: Byte offset within the Buf to start the registration + :return: The newly created BufMR on success + """ + self.logger = logging.getLogger(self.__class__.__name__) + cdef void *addr = (buf.addr + offset) + self.mr = v.ibv_reg_buf_mr(pd.pd, buf.bufh, addr, length, access) + if self.mr == NULL: + raise PyverbsRDMAErrno(f'Failed to register a buf MR. length: ' + f'{length}, access flags: {access}') + self.buf = addr + super().__init__(pd, length, access) + self.is_user_addr = True + self.is_huge = False + self.mmap_length = 0 + self.pd = pd + self.backing_buf = buf + pd.add_ref(self) + buf.add_ref(self) + self.logger.debug(f'Registered buf ibv_mr. Length: {length}, access ' + f'flags {access}') diff --git a/pyverbs/pd.pxd b/pyverbs/pd.pxd index 04e44531a..585afc559 100644 --- a/pyverbs/pd.pxd +++ b/pyverbs/pd.pxd @@ -16,6 +16,7 @@ cdef class PD(PyverbsCM): cdef remove_ref(self, obj) cdef object srqs cdef object mrs + cdef object bufs cdef object mws cdef object ahs cdef object qps diff --git a/pyverbs/pd.pyx b/pyverbs/pd.pyx index 1dc2bb06b..0fd05f79a 100644 --- a/pyverbs/pd.pyx +++ b/pyverbs/pd.pyx @@ -15,7 +15,7 @@ from pyverbs.base cimport close_weakrefs from pyverbs.wr cimport copy_sg_array from pyverbs.device cimport Context from pyverbs.cmid cimport CMID -from .mr cimport MR, MW, DMMR +from .mr cimport MR, MW, DMMR, Buf from pyverbs.srq cimport SRQ from pyverbs.addr cimport AH from pyverbs.cq cimport CQEX @@ -63,6 +63,7 @@ cdef class PD(PyverbsCM): self.logger.debug('Created PD') self.srqs = weakref.WeakSet() self.mrs = weakref.WeakSet() + self.bufs = weakref.WeakSet() self.mws = weakref.WeakSet() self.ahs = weakref.WeakSet() self.qps = weakref.WeakSet() @@ -113,7 +114,8 @@ cdef class PD(PyverbsCM): if self.logger: 
self.logger.debug('Closing PD') close_weakrefs([self.deks, self.mkeys, self.parent_domains, self.qps, - self.wqs, self.ahs, self.mws, self.mrs, self.srqs]) + self.wqs, self.ahs, self.mws, self.bufs, self.mrs, + self.srqs]) if not self._is_imported: rc = v.ibv_dealloc_pd(self.pd) if rc != 0: @@ -124,6 +126,8 @@ cdef class PD(PyverbsCM): cdef add_ref(self, obj): if isinstance(obj, MR) or isinstance(obj, DMMR): self.mrs.add(obj) + elif isinstance(obj, Buf): + self.bufs.add(obj) elif isinstance(obj, MW): self.mws.add(obj) elif isinstance(obj, AH): From a79ecd9e1124e8d65265f9552b75d66906284481 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 25 Jun 2026 15:17:44 +0200 Subject: [PATCH 25/33] pyverbs: Remove MREx.close() override to fix deallocation crash MREx.close() called the base close() and then reset its own cdef object member via 'self.dmah = None'. That assignment is both redundant and unsafe during deallocation. Cython's generated subclass deallocator clears the subclass cdef object members (dmah) before chaining to the base deallocator that runs the inherited MR.__dealloc__ -> self.close(). So when close() runs at deallocation time, dmah is already NULL and 'self.dmah = None' does an unguarded DECREF on NULL, segfaulting. This only stayed hidden while every MREx was closed explicitly first (which sets self.mr = NULL and makes the deallocation-time close() skip its body); an MREx reclaimed by the garbage collector crashes. Releasing dmah here is unnecessary: tp_dealloc already drops the reference, and MR.close() performs the ibv_dereg_mr(). Drop the override and inherit MR.close(), matching the other MR subclasses (e.g. DMMR) that do not reset their extra members in close(). 
Signed-off-by: Jiri Pirko --- pyverbs/mr.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx index 727c7abb4..1921f70fa 100644 --- a/pyverbs/mr.pyx +++ b/pyverbs/mr.pyx @@ -717,11 +717,6 @@ cdef class MREx(MR): print_format.format('buf', self.buf) + \ print_format.format('handle', self.handle) - cpdef close(self): - """Close MREx and release its association with DMAHandle.""" - if self.mr != NULL: - super(MREx, self).close() - self.dmah = None cdef class Buf(PyverbsCM): """ From f4ff53eca9a9d6a1095d4a0c66a55a24d0cf4037 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Jun 2026 19:03:00 +0200 Subject: [PATCH 26/33] tests: Add tests for ibv_buf provider-aware buffers Add test_buf.py exercising the ibv_buf API (ibv_alloc_buf, ibv_reg_buf_mr, ibv_free_buf), and the ibv_reg_mr_ex() IBV_REG_MR_MASK_BUF path, over both a plain PD and a parent domain created with ALLOW_CC_UNPROTECTED_ALLOC, in API-only and RC/UD traffic variants. Add an is_cq_ex option to the rdma_traffic and atomic_traffic helpers so the tests can drive an extended CQ. Signed-off-by: Jiri Pirko --- tests/CMakeLists.txt | 1 + tests/test_buf.py | 362 +++++++++++++++++++++++++++++++++++++++++++ tests/utils.py | 16 +- 3 files changed, 373 insertions(+), 6 deletions(-) create mode 100644 tests/test_buf.py diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3ad9bfc5b..e42f94e34 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,6 +14,7 @@ rdma_python_test(tests rdmacm_utils.py test_addr.py test_atomic.py + test_buf.py test_cq.py test_cq_events.py test_cqex.py diff --git a/tests/test_buf.py b/tests/test_buf.py new file mode 100644 index 000000000..ec6cfe033 --- /dev/null +++ b/tests/test_buf.py @@ -0,0 +1,362 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2026 NVIDIA Corporation . All rights reserved. 
See COPYING file +""" +Tests for the provider-aware buffer API: ibv_alloc_buf(), ibv_free_buf(), +ibv_reg_buf_mr() and the ibv_reg_mr_ex() IBV_REG_MR_MASK_BUF path. +""" +import unittest +import errno +import resource + +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.pd import PD, ParentDomain, ParentDomainInitAttr +from pyverbs.mr import MR, Buf, BufMR, MREx +from pyverbs.cq import CqInitAttrEx, CQEX +import pyverbs.device as d +from pyverbs.libibverbs_enums import ibv_access_flags, ibv_atomic_cap, \ + ibv_cq_init_attr_mask, ibv_wr_opcode, ibv_parent_domain_init_attr_mask, \ + IBV_WC_STANDARD_FLAGS, _IBV_DEVICE_CC_DMA_BOUNCE +from tests.base import PyverbsAPITestCase, RCResources, UDResources, \ + RDMATestCase +import tests.utils as u + + +# errnos that mean the environment cannot provide CC shared buffers / parent +# domains; the affected test is skipped rather than failed. +SKIP_ERRNOS = (errno.EOPNOTSUPP, errno.ENOENT, errno.ENODEV) + +PAGE_SIZE = resource.getpagesize() + + +def device_has_cc_dma_bounce(ctx): + """Whether the device reports IBV_DEVICE_CC_DMA_BOUNCE.""" + return bool(ctx.query_device_ex().device_cap_flags_ex & + _IBV_DEVICE_CC_DMA_BOUNCE) + + +def make_cc_pd(ctx): + """ + Allocate a base PD and a CC PD opting in to unprotected/shared memory for + CoCo guests. Returns a (base_pd, cc_pd) tuple; the base PD must outlive the + CC PD. Skips if CC PDs are unsupported. + """ + base_pd = PD(ctx) + attr = ParentDomainInitAttr( + pd=base_pd, + comp_mask=ibv_parent_domain_init_attr_mask. 
+ IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC) + try: + pd = ParentDomain(ctx, attr=attr) + except PyverbsRDMAError as ex: + base_pd.close() + if ex.error_code in SKIP_ERRNOS: + raise unittest.SkipTest('CC PD is not supported') + raise + return base_pd, pd + + +def alloc_buf(pd, size): + """ + Allocate a buffer of ibv_buf type, skipping if the provider + does not support it + """ + try: + return Buf(pd, size) + except PyverbsRDMAError as ex: + if ex.error_code in SKIP_ERRNOS: + raise unittest.SkipTest('ibv_alloc_buf() is not supported') + raise + + +def register_buf_mr(pd, buf, length, access, offset=0, via_reg_mr_ex=False): + """ + Register (a subrange of) buf, either through ibv_reg_buf_mr() or, when + via_reg_mr_ex is set, through the ibv_reg_mr_ex() IBV_REG_MR_MASK_BUF path. + """ + try: + if via_reg_mr_ex: + return MREx(pd, length=length, access=access, buf=buf, + address=buf.addr + offset) + return BufMR(pd, buf, length, access, offset=offset) + except PyverbsRDMAError as ex: + if ex.error_code in SKIP_ERRNOS: + raise unittest.SkipTest('Buffer MR registration is not supported') + raise + + +def init_buf_resource(res, cc, via_reg_mr_ex, mr_access, buf_size, mr_offset): + """Store the buffer parameters before the base resource init runs.""" + res.cc = cc + res.via_reg_mr_ex = via_reg_mr_ex + res.mr_access = mr_access + res.buf_size = buf_size + res.mr_offset = mr_offset + res.base_pd = None + res.data_buf = None + + +def create_pd(res): + """Create the resource's PD: a plain PD or a CC PD.""" + if res.cc: + res.base_pd, res.pd = make_cc_pd(res.ctx) + else: + # A plain PD registers private memory, which a DMA-bounce device + # rejects; that device is exercised by the cc=True resources. 
+ if device_has_cc_dma_bounce(res.ctx): + raise unittest.SkipTest('Plain-memory registration is rejected on ' + 'a DMA-bounce device') + res.pd = PD(res.ctx) + + +def create_cq(res): + """Create the resource's extended CQ, bound to the CC PD for cc.""" + comp_mask = ibv_cq_init_attr_mask.IBV_CQ_INIT_ATTR_MASK_FLAGS + if res.cc: + comp_mask |= ibv_cq_init_attr_mask.IBV_CQ_INIT_ATTR_MASK_PD + cqia = CqInitAttrEx(cqe=res.num_msgs, wc_flags=IBV_WC_STANDARD_FLAGS, + parent_domain=res.pd if res.cc else None, + comp_mask=comp_mask) + try: + res.cq = CQEX(res.ctx, cqia) + except PyverbsRDMAError as ex: + if ex.error_code in SKIP_ERRNOS: + raise unittest.SkipTest('Extended CQ is not supported') + raise + + +def create_buf_mr(res, mr_len, buf_size, offset): + """Allocate an ibv_buf and register (a subrange of) it as the MR.""" + if buf_size is None: + buf_size = mr_len + offset + res.data_buf = alloc_buf(res.pd, buf_size) + res.mr = register_buf_mr(res.pd, res.data_buf, mr_len, res.mr_access, + offset=offset, via_reg_mr_ex=res.via_reg_mr_ex) + + +class BufRC(RCResources): + """RC resources whose data buffer is allocated with ibv_alloc_buf().""" + def __init__(self, *args, cc=False, via_reg_mr_ex=False, + mr_access=ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, + buf_size=None, mr_offset=0, **kwargs): + init_buf_resource(self, cc, via_reg_mr_ex, mr_access, buf_size, + mr_offset) + super().__init__(*args, **kwargs) + + def create_pd(self): + create_pd(self) + + def create_cq(self): + create_cq(self) + + def create_mr(self): + create_buf_mr(self, self.msg_size, self.buf_size, self.mr_offset) + + def create_qp_attr(self): + attr = super().create_qp_attr() + attr.qp_access_flags = self.mr_access + return attr + + +class BufUD(UDResources): + """UD resources whose data buffer is allocated with ibv_alloc_buf().""" + def __init__(self, *args, cc=False, via_reg_mr_ex=False, + mr_access=ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, + buf_size=None, mr_offset=0, **kwargs): + 
init_buf_resource(self, cc, via_reg_mr_ex, mr_access, buf_size, + mr_offset) + super().__init__(*args, **kwargs) + + def create_pd(self): + create_pd(self) + + def create_cq(self): + create_cq(self) + + def create_mr(self): + # UD prepends a GRH on receive, so the buffer needs room for it. + mr_len = self.msg_size + self.GRH_SIZE + create_buf_mr(self, mr_len, self.buf_size, self.mr_offset) + + +class BufAPITest(PyverbsAPITestCase): + """Single-node API tests for ibv_alloc_buf()/ibv_reg_buf_mr().""" + + def get_pd(self, cc=False): + """Create a PD, or a CC PD when cc is set.""" + if cc: + _, pd = make_cc_pd(self.ctx) + return pd + if device_has_cc_dma_bounce(self.ctx): + raise unittest.SkipTest('Plain-memory registration is rejected on ' + 'a DMA-bounce device') + return PD(self.ctx) + + def get_buf(self, pd, size): + return alloc_buf(pd, size) + + def get_mr(self, pd, buf, length, offset=0, via_reg_mr_ex=False): + return register_buf_mr(pd, buf, length, + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, + offset=offset, via_reg_mr_ex=via_reg_mr_ex) + + def check_multiple_mrs_one_buf(self, pd, via_reg_mr_ex=False): + """Register two disjoint subranges of one buffer and access them.""" + buf = self.get_buf(pd, 2 * PAGE_SIZE) + mr1 = self.get_mr(pd, buf, PAGE_SIZE, offset=0, + via_reg_mr_ex=via_reg_mr_ex) + mr2 = self.get_mr(pd, buf, PAGE_SIZE, offset=PAGE_SIZE, + via_reg_mr_ex=via_reg_mr_ex) + self.assertNotEqual(mr1.lkey, mr2.lkey, + 'MR lkeys for disjoint subranges must differ') + self.assertEqual(mr1.buf, buf.addr, + 'MR1 address does not match buffer start') + self.assertEqual(mr2.buf, buf.addr + PAGE_SIZE, + 'MR2 address does not match its subrange start') + mr1.write('a' * PAGE_SIZE, PAGE_SIZE) + mr2.write('b' * PAGE_SIZE, PAGE_SIZE) + self.assertEqual(mr1.read(PAGE_SIZE, 0), b'a' * PAGE_SIZE, + 'MR1 readback does not match written data') + self.assertEqual(mr2.read(PAGE_SIZE, 0), b'b' * PAGE_SIZE, + 'MR2 readback does not match written data') + + def 
test_multiple_mrs_one_buf_plain_pd_reg_buf_mr(self): + """Plain PD: registered with ibv_reg_buf_mr().""" + self.check_multiple_mrs_one_buf(self.get_pd()) + + def test_multiple_mrs_one_buf_plain_pd_reg_mr_ex(self): + """Plain PD: registered with the ibv_reg_mr_ex() MASK_BUF path.""" + self.check_multiple_mrs_one_buf(self.get_pd(), via_reg_mr_ex=True) + + def test_multiple_mrs_one_buf_cc_pd_reg_buf_mr(self): + """CC PD: registered with ibv_reg_buf_mr().""" + self.check_multiple_mrs_one_buf(self.get_pd(cc=True)) + + def check_reg_wrong_allocating_pd_fails(self, pd, other_pd): + """Registering a buffer with a non-allocating PD must fail.""" + buf = self.get_buf(pd, PAGE_SIZE) + with self.assertRaises(PyverbsRDMAError) as cm: + register_buf_mr(other_pd, buf, PAGE_SIZE, + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + self.assertEqual(cm.exception.error_code, errno.EINVAL, + 'Registering a buffer with a non-allocating PD ' + 'must fail with EINVAL') + + def test_buf_reg_wrong_allocating_pd_fails_plain_pd(self): + """Plain PDs: a non-allocating PD is rejected.""" + self.check_reg_wrong_allocating_pd_fails(self.get_pd(), self.get_pd()) + + def test_buf_reg_wrong_allocating_pd_fails_cc_pd(self): + """CC PDs: a non-allocating PD is rejected.""" + self.check_reg_wrong_allocating_pd_fails(self.get_pd(cc=True), + self.get_pd(cc=True)) + + def check_reg_length_exceeds_buffer_fails(self, pd): + """Registering a length larger than the buffer must fail.""" + buf = self.get_buf(pd, PAGE_SIZE) + with self.assertRaises(PyverbsRDMAError) as cm: + register_buf_mr(pd, buf, 2 * PAGE_SIZE, + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + self.assertEqual(cm.exception.error_code, errno.EINVAL, + 'Registering a length larger than the buffer ' + 'must fail with EINVAL') + + def test_buf_reg_length_exceeds_buffer_fails_plain_pd(self): + """Plain PD: a too-large length is rejected.""" + self.check_reg_length_exceeds_buffer_fails(self.get_pd()) + + def 
test_buf_reg_length_exceeds_buffer_fails_cc_pd(self): + """CC PD: a too-large length is rejected.""" + self.check_reg_length_exceeds_buffer_fails(self.get_pd(cc=True)) + + def check_reg_offset_length_exceeds_buffer_fails(self, pd): + """Registering offset + length past the buffer must fail.""" + buf = self.get_buf(pd, PAGE_SIZE) + with self.assertRaises(PyverbsRDMAError) as cm: + register_buf_mr(pd, buf, PAGE_SIZE, + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, + offset=PAGE_SIZE) + self.assertEqual(cm.exception.error_code, errno.EINVAL, + 'Registering past the buffer end via offset ' + 'must fail with EINVAL') + + def test_buf_reg_offset_length_exceeds_buffer_fails_plain_pd(self): + """Plain PD: an out-of-range offset+length is rejected.""" + self.check_reg_offset_length_exceeds_buffer_fails(self.get_pd()) + + def test_buf_reg_offset_length_exceeds_buffer_fails_cc_pd(self): + """CC PD: an out-of-range offset+length is rejected.""" + self.check_reg_offset_length_exceeds_buffer_fails(self.get_pd(cc=True)) + + def test_plain_mr_rejected_on_bounce_device(self): + """DMA-bounce device: a plain ibv_reg_mr() is rejected.""" + if not device_has_cc_dma_bounce(self.ctx): + raise unittest.SkipTest('Device does not report CC_DMA_BOUNCE') + with PD(self.ctx) as pd: + with self.assertRaises( + PyverbsRDMAError, + msg='Plain ibv_reg_mr() must be rejected on a ' + 'DMA-bounce device'): + MR(pd, PAGE_SIZE, ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + + +class BufTrafficTest(RDMATestCase): + """RC/UD traffic over ibv_alloc_buf() data buffers.""" + def test_buf_rc_send_cc_pd_reg_buf_mr(self): + """CC PD: RC send/recv over a buffer MR.""" + self.create_players(BufRC, cc=True) + u.traffic(**self.traffic_args, is_cq_ex=True) + + def test_buf_rc_send_large_msg_cc_pd_reg_buf_mr(self): + """CC PD: RC send/recv with a multi-page buffer MR.""" + self.create_players(BufRC, cc=True, msg_size=16384) + u.traffic(**self.traffic_args, is_cq_ex=True) + + def 
test_buf_rc_send_mr_subrange_cc_pd_reg_buf_mr(self): + """CC PD: RC send/recv over a buffer-subrange MR.""" + self.create_players(BufRC, cc=True, mr_offset=PAGE_SIZE, + buf_size=PAGE_SIZE + 16384, msg_size=8192) + u.traffic(**self.traffic_args, is_cq_ex=True) + + def test_buf_rc_rdma_write_imm_cc_pd_reg_buf_mr(self): + """CC PD: RC RDMA write-with-immediate into a buffer MR.""" + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE) + self.create_players(BufRC, cc=True, mr_access=access) + u.traffic(**self.traffic_args, is_cq_ex=True, + send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE_WITH_IMM) + + def test_buf_rc_rdma_read_cc_pd_reg_buf_mr(self): + """CC PD: RC RDMA read from a buffer MR.""" + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_READ) + self.create_players(BufRC, cc=True, mr_access=access) + u.rdma_traffic(**self.traffic_args, is_cq_ex=True, + send_op=ibv_wr_opcode.IBV_WR_RDMA_READ) + + def test_buf_rc_atomic_fetch_add_cc_pd_reg_buf_mr(self): + """CC PD: RC atomic fetch&add on a buffer MR.""" + with d.Context(name=self.dev_name) as ctx: + atomic_caps = ctx.query_device().atomic_caps + if atomic_caps == ibv_atomic_cap.IBV_ATOMIC_NONE: + raise unittest.SkipTest('Atomic operations are not supported') + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC) + self.create_players(BufRC, cc=True, mr_access=access, msg_size=8) + u.atomic_traffic(**self.traffic_args, is_cq_ex=True, + send_op=ibv_wr_opcode.IBV_WR_ATOMIC_FETCH_AND_ADD) + + def test_buf_ud_send_cc_pd_reg_buf_mr(self): + """CC PD: UD send/recv over a buffer MR.""" + self.create_players(BufUD, cc=True) + u.traffic(**self.traffic_args, is_cq_ex=True) + + def test_buf_rc_send_plain_pd_reg_buf_mr(self): + """Plain PD: RC send/recv over a buffer MR.""" + self.create_players(BufRC, cc=False) + u.traffic(**self.traffic_args, is_cq_ex=True) + + 
def test_buf_rc_send_cc_pd_reg_mr_ex(self): + """CC PD: RC send/recv via ibv_reg_mr_ex().""" + self.create_players(BufRC, cc=True, via_reg_mr_ex=True) + u.traffic(**self.traffic_args, is_cq_ex=True) diff --git a/tests/utils.py b/tests/utils.py index 0860b304b..9a74e5425 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1264,7 +1264,7 @@ def prepare_validate_data(client=None, server=None): def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, - send_op=None, force_page_faults=False): + send_op=None, force_page_faults=False, is_cq_ex=False): """ Runs basic RDMA traffic between two sides. No receive WQEs are posted. For RDMA send with immediate, use traffic(). @@ -1277,6 +1277,7 @@ def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, :param send_op: The send_wr opcode. :param force_page_faults: If True, use madvise to hint that we don't need the MR's buffer to force page faults (useful for ODP testing). + :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq() :return: """ # Using the new post send API, we need the SGE, not the SendWR @@ -1287,6 +1288,7 @@ def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, else: ah_client = None ah_server = None + poll = poll_cq_ex if is_cq_ex else poll_cq send_element_idx = 1 if new_send else 0 same_side_check = send_op in [ibv_wr_opcode.IBV_WR_RDMA_READ, ibv_wr_opcode.IBV_WR_ATOMIC_CMP_AND_SWP, @@ -1298,7 +1300,7 @@ def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, prepare_validate_data(client=client, server=server) c_send_wr = get_send_elements(client, False, send_op)[send_element_idx] send(client, c_send_wr, send_op, new_send, ah=ah_client) - poll_cq(client.cq) + poll(client.cq) if same_side_check: msg_received = client.mem_read(client.msg_size) else: @@ -1308,7 +1310,7 @@ def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, s_send_wr = get_send_elements(server, True, send_op)[send_element_idx] 
prepare_validate_data(client=client, server=server) send(server, s_send_wr, send_op, new_send, ah=ah_server) - poll_cq(server.cq) + poll(server.cq) if same_side_check: msg_received = server.mem_read(client.msg_size) else: @@ -1319,7 +1321,7 @@ def rdma_traffic(client, server, iters, gid_idx, port, new_send=False, def atomic_traffic(client, server, iters, gid_idx, port, new_send=False, send_op=None, receiver_val=1, sender_val=2, swap=0, - client_wr=1, server_wr=1, **kwargs): + client_wr=1, server_wr=1, is_cq_ex=False, **kwargs): """ Runs atomic traffic between two sides. :param client: Client side, clients base class is BaseTraffic @@ -1333,6 +1335,7 @@ def atomic_traffic(client, server, iters, gid_idx, port, new_send=False, :param sender_val: The requested value on the sender SendWR. :param client_wr: Number of WR the client will post before polling all of them :param server_wr: Number of WR the server will post before polling all of them + :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq() :param kwargs: General arguments (shared with other traffic functions). 
""" send_element_idx = 1 if new_send else 0 @@ -1342,6 +1345,7 @@ def atomic_traffic(client, server, iters, gid_idx, port, new_send=False, else: ah_client = None ah_server = None + poll = poll_cq_ex if is_cq_ex else poll_cq for _ in range(iters): client.mr.write(int.to_bytes(sender_val, 1, byteorder='big') * 8, 8) @@ -1355,7 +1359,7 @@ def atomic_traffic(client, server, iters, gid_idx, port, new_send=False, c_send_wr.set_qp_type_xrc(server.srq.get_srq_num()) send(client, c_send_wr, send_op, new_send, ah=ah_client, cmp_add=sender_val, swap=swap) - poll_cq(client.cq, count=client_wr) + poll(client.cq, count=client_wr) validate_atomic(send_op, server, client, receiver_val=receiver_val + sender_val * (client_wr - 1), send_cmp_add=sender_val, send_swp=swap) @@ -1370,7 +1374,7 @@ def atomic_traffic(client, server, iters, gid_idx, port, new_send=False, s_send_wr.set_qp_type_xrc(client.srq.get_srq_num()) send(server, s_send_wr, send_op, new_send, ah=ah_server, cmp_add=sender_val, swap=swap) - poll_cq(server.cq, count=server_wr) + poll(server.cq, count=server_wr) validate_atomic(send_op, client, server, receiver_val=receiver_val + sender_val * (server_wr - 1), send_cmp_add=sender_val, send_swp=swap) From c3cc7dfdfee972b492c80309998975d585cfcf1e Mon Sep 17 00:00:00 2001 From: David Hu Date: Fri, 26 Jun 2026 05:10:41 +0000 Subject: [PATCH 27/33] provider/irdma: Implement alloc_parent_domain for CoCo Allocation of system_cc_dmabuf is only carried out if CoCo DMA bounce feature in device, and cc unprotected attribute are both present. Proper cleanup is added to cleanup path. 
Signed-off-by: David Hu --- providers/irdma/umain.c | 1 + providers/irdma/umain.h | 12 +++++++++++ providers/irdma/uverbs.c | 43 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/providers/irdma/umain.c b/providers/irdma/umain.c index 03f87e048..b44f75f4d 100644 --- a/providers/irdma/umain.c +++ b/providers/irdma/umain.c @@ -87,6 +87,7 @@ static const struct verbs_context_ops irdma_uctx_srq_ops = { static const struct verbs_context_ops irdma_uctx_ops = { .alloc_mw = irdma_ualloc_mw, .alloc_pd = irdma_ualloc_pd, + .alloc_parent_domain = irdma_ualloc_parent_domain, .attach_mcast = irdma_uattach_mcast, .bind_mw = irdma_ubind_mw, .cq_event = irdma_cq_event, diff --git a/providers/irdma/umain.h b/providers/irdma/umain.h index f40ffb641..76568a256 100644 --- a/providers/irdma/umain.h +++ b/providers/irdma/umain.h @@ -46,8 +46,14 @@ struct irdma_upd { void *arm_cq_page; void *arm_cq; uint32_t pd_id; + bool is_parent_domain; }; +struct irdma_parent_domain { + struct irdma_upd base_pd; + struct ibv_dmabuf_heap *dmabuf_heap; +} + struct irdma_uvcontext { struct verbs_context ibv_ctx; struct irdma_upd *iwupd; @@ -124,6 +130,8 @@ int irdma_uquery_device_ex(struct ibv_context *context, int irdma_uquery_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr); struct ibv_pd *irdma_ualloc_pd(struct ibv_context *context); +struct ibv_pd *irdma_ualloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr); int irdma_ufree_pd(struct ibv_pd *pd); struct ibv_mr *irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access); @@ -179,4 +187,8 @@ void irdma_async_event(struct ibv_context *context, void irdma_set_hw_attrs(struct irdma_hw_attrs *attrs); void *irdma_mmap(int fd, off_t offset); void irdma_munmap(void *map); +static inline struct irdma_parent_domain *to_iparent_domain(struct ibv_pd *pd) +{ + return container_of(pd, struct irdma_parent_domain, base_pd.ibv_pd); +} #endif /* IRDMA_UMAIN_H */ 
diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index 76400aa9f..317fc00cd 100644 --- a/providers/irdma/uverbs.c +++ b/providers/irdma/uverbs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "umain.h" #include "abi.h" @@ -98,6 +99,38 @@ struct ibv_pd *irdma_ualloc_pd(struct ibv_context *context) return NULL; } +/** + * irdma_ualloc_parent_domain - alloc protection domain with CoCo support + * @context: user context of the device + * @attr: parent domain attributes + */ +struct ibv_pd *irdma_ualloc_parent_domain(struct ibv_context *context, + struct ibv_parent_domain_init_attr *attr) +{ + struct irdma_parent_domain *iparent; + struct irdma_upd *base_pd; = container_of(attr->pd, struct irdma_upd, ibv_pd); + struct irdma_uvcontext *iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx.context); + + iparent = calloc(1, sizeof(iparent)); + if (!parent) + return NULL; + + /* Inherit kernel ID and context from base PD */ + iparent->base_pd.ibv_pd.context = context; + iparent->base_pd.pd_id = base_pd->pd_id; + iparent->base_pd.is_parent_domain = true; + + if ((attr->comp_mask & IBV_PARENT_DOMAIN_INIT_ATTR_ALLOW_CC_UNPROTECTED_ALLOC) && (iwvctx->uk_attrs.feature_flags & IBV_DEVICE_CC_DMA_BOUNCE)) { + iparent->dmabuf_heap = ibv_dmabuf_heap_cc_shared_init(); + if (!iparent->dmabuf_heap) { + free(iparent); + return NULL; + } + } + + return &iparent->base_pd.ibv_pd; +} + /** * irdma_ufree_pd - free pd resources * @pd: pd to free resources @@ -108,6 +141,16 @@ int irdma_ufree_pd(struct ibv_pd *pd) int ret; iwupd = container_of(pd, struct irdma_upd, ibv_pd); + + /* If it is a parent domain, just clean up userspace resources */ + if (iwupd->is_parent_domain) { + struct irdma_parent_domain *iparent = container_of(iwupd, struct irdma_parent_domain, base_pd); + if (iparent->dmabuf_heap) + ibv_dmabuf_heap_destroy(iparent->dmabuf_heap); + free(iparent); + return 0; + } + ret = ibv_cmd_dealloc_pd(pd); if (ret) return ret; From 
0c1e6e3752057e86ee95f4f71efd96a4f871c8b5 Mon Sep 17 00:00:00 2001 From: David Hu Date: Fri, 26 Jun 2026 06:08:45 +0000 Subject: [PATCH 28/33] provider/irdma: Implement ibv_alloc_buf and ibv_free_buf Signed-off-by: David Hu --- providers/irdma/umain.c | 2 + providers/irdma/umain.h | 9 +++++ providers/irdma/uverbs.c | 85 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/providers/irdma/umain.c b/providers/irdma/umain.c index b44f75f4d..569f987f2 100644 --- a/providers/irdma/umain.c +++ b/providers/irdma/umain.c @@ -88,6 +88,8 @@ static const struct verbs_context_ops irdma_uctx_ops = { .alloc_mw = irdma_ualloc_mw, .alloc_pd = irdma_ualloc_pd, .alloc_parent_domain = irdma_ualloc_parent_domain, + .alloc_buf = irdma_ualloc_buf, + .free_buf = irdma_ufree_buf, .attach_mcast = irdma_uattach_mcast, .bind_mw = irdma_ubind_mw, .cq_event = irdma_cq_event, diff --git a/providers/irdma/umain.h b/providers/irdma/umain.h index 76568a256..057411f04 100644 --- a/providers/irdma/umain.h +++ b/providers/irdma/umain.h @@ -54,6 +54,13 @@ struct irdma_parent_domain { struct ibv_dmabuf_heap *dmabuf_heap; } +struct irdma_buf { + struct ibv_buf ibv_buf; + void *buf; + size_t length; + bool is_dmabuf; +} + struct irdma_uvcontext { struct verbs_context ibv_ctx; struct irdma_upd *iwupd; @@ -133,6 +140,8 @@ struct ibv_pd *irdma_ualloc_pd(struct ibv_context *context); struct ibv_pd *irdma_ualloc_parent_domain(struct ibv_context *context, struct ibv_parent_domain_init_attr *attr); int irdma_ufree_pd(struct ibv_pd *pd); +void *irdma_ualloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf); +void irdma_ufree_buf(struct ibv_buf *buf); struct ibv_mr *irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access); struct ibv_mr *irdma_ureg_mr_dmabuf(struct ibv_pd *pd, uint64_t offset, diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index 317fc00cd..1e91b2963 100644 --- a/providers/irdma/uverbs.c +++ 
b/providers/irdma/uverbs.c @@ -160,6 +160,91 @@ int irdma_ufree_pd(struct ibv_pd *pd) return 0; } +/** + * irdma_ualloc_buf - allocate a provider aware buffer + * @pd: protection domain + * @size: requested buffer size + * @buf: returns the abstract buffer handle + */ +void *irdma_ualloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf) +{ + struct irdma_upd *iwupd = container_of(pd, struct irdma_upd, ibv_pd); + struct ibv_dmabuf_heap *dmabuf_heap = NULL; + struct irdma_buf *ibuf; + int dmabuf_fd = -1; + void *addr; + int ret; + + if (iwupd->is_parent_domain) { + struct irdma_parent_domain *iparent = container_of(iwupd, struct irdma_parent_domain, ipd); + dmabuf_heap = iparent->dmabuf_heap; + } + + ibuf = calloc(1, sizeof(*ibuf)); + if (!ibuf) + return NULL; + + size = roundup(size, IRDMA_HW_PAGE_SIZE); + + /* CC DMA Bounce buffer path */ + if (dmabuf_heap) { + addr = ibv_dmabuf_heap_alloc(dmabuf_heap, size, &dmabuf_fd); + if (!addr) + goto err_free_ibuf; + + if (ibv_dontfork_range(addr, size)) { + ibv_dmabuf_heap_free(addr, size, dmabuf_fd); + goto err_free_ibuf; + } + + ibuf->is_dmabuf = true; + ibv_buf_init_dmabuf(&ibuf->ibv_buf, pd, addr, size, dmabuf_fd); + } else { + ret = posix_memalign(&addr, IRDMA_HW_PAGE_SIZE, size); + if (ret) { + errno = ret; + goto err_free_ibuf; + } + + if (ibv_dontfork_range(addr, size)) { + free(addr); + goto err_free_ibuf; + } + + memset(addr, 0, size); + ibuf->is_dmabuf = false; + ibv_ubuf_init(&ibuf->ibv_buf, pd, addr, size); + } + + ibuf->buf = addr; + ibuf->length = size; + *buf = &ibuf->ibv_buf; + + return addr; + +err_free_ibuf: + free(ibuf); + return NULL; +} + +/** + * irdma_ufree_buf - free a provider aware buffer + * @buf: abstract buffer handle + */ +void irdma_ufree_buf(struct ibv_buf *buf) +{ + struct irdma_buf *ibuf = container_of(buf, struct irdma_buf, ibv_buf); + + ibv_dontfork_range(ibuf->buf, ibuf->length); + + if (ibuf->is_dmabuf) + ibv_dmabuf_heap_free(ibuf->buf, ibuf->length, buf->dmabuf_fd); + else + 
free(ibuf->buf); + + free(ibuf); +} + /** * irdma_ureg_mr - register user memory region * @pd: pd for the mr From b6fd436f4684cd51ba648bc99eb5e974802963da Mon Sep 17 00:00:00 2001 From: David Hu Date: Sat, 27 Jun 2026 05:10:16 +0000 Subject: [PATCH 29/33] provider/irdma: Add dmabuf allocator for internal queues Move common allocation logic into shared allocator function __irdma_alloc_buf. As a result, uAPI irdma_ualloc_buf is updated accordingly. Proper cleanup paths are also updated accordingly. Signed-off-by: David Hu --- providers/irdma/umain.h | 4 ++ providers/irdma/uverbs.c | 124 +++++++++++++++++++++------------------ 2 files changed, 72 insertions(+), 56 deletions(-) diff --git a/providers/irdma/umain.h b/providers/irdma/umain.h index 057411f04..87939d975 100644 --- a/providers/irdma/umain.h +++ b/providers/irdma/umain.h @@ -85,6 +85,7 @@ struct irdma_usrq { struct verbs_mr vmr; pthread_spinlock_t lock; struct irdma_srq_uk srq; + struct irdma_buf srq_buf; size_t buf_size; }; @@ -92,6 +93,8 @@ struct irdma_ucq { struct verbs_cq verbs_cq; struct verbs_mr vmr; struct verbs_mr vmr_shadow_area; + struct irdma_buf cq_buf; + struct irdma_buf shadow_buf; pthread_spinlock_t lock; size_t buf_size; bool is_armed; @@ -112,6 +115,7 @@ struct irdma_uqp { struct irdma_ucq *send_cq; struct irdma_ucq *recv_cq; struct verbs_mr vmr; + struct irdma_buf sq_buf; size_t buf_size; uint32_t irdma_drv_opt; pthread_spinlock_t lock; diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index 1e91b2963..a9bb0a91e 100644 --- a/providers/irdma/uverbs.c +++ b/providers/irdma/uverbs.c @@ -30,6 +30,66 @@ static inline void print_fw_ver(uint64_t fw_ver, char *str, size_t len) snprintf(str, len, "%d.%d", major, minor); } +static int __irdma_alloc_buf(struct ibv_pd *pd, size_t size, size_t alignment, + struct irdma_buf *ibuf) +{ + struct ibv_dmabuf_heap *heap = NULL; + int dmabuf_fd = -1; + int ret; + + if (pd) { + struct irdma_upd *iwupd = container_of(pd, struct irdma_upd, ibv_pd); 
+ if (iwupd->is_parent_domain) { + struct irdma_parent_domain *iparent = container_of(iwupd, struct irdma_parent_domain, base_pd); + heap = iparent->dmabuf_heap; + } + } + + ibuf->length = roundup(size, alignment); + + if (heap) { + ibuf->buf = ibv_dmabuf_heap_alloc(heap, ibuf->length, &dmabuf_fd); + if (!ibuf->buf) + return ENOMEM; + + if (ibv_dontfork_range(ibuf->buf, ibuf->length)) { + ibv_dmabuf_heap_free(ibuf->buf, ibuf->length, dmabuf_fd); + return ENOMEM; + } + + ibuf->is_dmabuf = true; + ibv_buf_init_dmabuf(&ibuf->ibv_buf, pd, ibuf->buf, ibuf->length, dmabuf_fd); + } else { + ret = posix_memalign(&ibuf->buf, alignment, ibuf->length); + if (ret) + return ret; + + if (ibv_dontfork_range(ibuf->buf, ibuf->length)) { + free(ibuf->buf); + return ENOMEM; + } + + memset(ibuf->buf, 0, ibuf->length); + ibuf->is_dmabuf = false; + ibv_buf_init(&ibuf->ibv_buf, pd, ibuf->buf, ibuf->length); + } + + return 0; +} + +static void __irdma_free_buf(struct irdma_buf *ibuf) +{ + if (!ibuf->buf) + return; + + ibv_dofork_range(ibuf->buf, ibuf->length); + + if (ibuf->is_dmabuf) + ibv_dmabuf_heap_free(ibuf->buf, ibuf->length, ibuf->ibv_buf.dmabuf_fd); + else + free(ibuf->buf); +} + /** * irdma_uquery_device_ex - query device attributes including extended properties * @context: user context for the device @@ -108,10 +168,10 @@ struct ibv_pd *irdma_ualloc_parent_domain(struct ibv_context *context, struct ibv_parent_domain_init_attr *attr) { struct irdma_parent_domain *iparent; - struct irdma_upd *base_pd; = container_of(attr->pd, struct irdma_upd, ibv_pd); + struct irdma_upd *base_pd = container_of(attr->pd, struct irdma_upd, ibv_pd); struct irdma_uvcontext *iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx.context); - iparent = calloc(1, sizeof(iparent)); - if (!parent) + iparent = calloc(1, sizeof(*iparent)); + if (!iparent) return NULL; @@ -161,70 +221,28 @@ int irdma_ufree_pd(struct ibv_pd *pd) } /** - * irdma_ualloc_buf - 
Allocate a provider aware user buffer * @pd: protection domain * @size: requested buffer size * @buf: returns the abstract buffer handle */ void *irdma_ualloc_buf(struct ibv_pd *pd, size_t size, struct ibv_buf **buf) { - struct irdma_upd *iwupd = container_of(pd, struct irdma_upd, ibv_pd); - struct ibv_dmabuf_heap *dmabuf_heap = NULL; struct irdma_buf *ibuf; - int dmabuf_fd = -1; - void *addr; - int ret; - if (iwupd->is_parent_domain) { - struct irdma_parent_domain *iparent = container_of(iwupd, struct irdma_parent_domain, ipd); - dmabuf_heap = iparent->dmabuf_heap; - } - + /* The user app owns this struct, so we must heap-allocate it */ ibuf = calloc(1, sizeof(*ibuf)); if (!ibuf) return NULL; - size = roundup(size, IRDMA_HW_PAGE_SIZE); - - /* CC DMA Bounce buffer path */ - if (dmabuf_heap) { - addr = ibv_dmabuf_heap_alloc(dmabuf_heap, size, &dmabuf_fd); - if (!addr) - goto err_free_ibuf; - - if (ibv_dontfork_range(addr, size)) { - ibv_dmabuf_heap_free(addr, size, dmabuf_fd); - goto err_free_ibuf; - } - - ibuf->is_dmabuf = true; - ibv_buf_init_dmabuf(&ibuf->ibv_buf, pd, addr, size, dmabuf_fd); - } else { - ret = posix_memalign(&addr, IRDMA_HW_PAGE_SIZE, size); - if (ret) { - errno = ret; - goto err_free_ibuf; - } - - if (ibv_dontfork_range(addr, size)) { - free(addr); - goto err_free_ibuf; - } - - memset(addr, 0, size); - ibuf->is_dmabuf = false; - ibv_ubuf_init(&ibuf->ibv_buf, pd, addr, size); + if (__irdma_alloc_buf(pd, size, IRDMA_HW_PAGE_SIZE, ibuf)) { + free(ibuf); + return NULL; } - ibuf->buf = addr; - ibuf->length = size; *buf = &ibuf->ibv_buf; return addr; - -err_free_ibuf: - free(ibuf); - return NULL; } /** @@ -235,13 +253,7 @@ void irdma_ufree_buf(struct ibv_buf *buf) { struct irdma_buf *ibuf = container_of(buf, struct irdma_buf, ibv_buf); - ibv_dontfork_range(ibuf->buf, ibuf->length); - - if (ibuf->is_dmabuf) - ibv_dmabuf_heap_free(ibuf->buf, ibuf->length, buf->dmabuf_fd); - else - free(ibuf->buf); - + __irdma_free_buf(ibuf); free(ibuf); } From 
7225039ff3046100de3b1ba4a3906b171a3e60fc Mon Sep 17 00:00:00 2001 From: David Hu Date: Sat, 27 Jun 2026 06:26:32 +0000 Subject: [PATCH 30/33] provider/irdma: Update irdma-abi.h to support dmabuf For simplicity, the legacy irdma ABI is extended to support passing a dmabuf fd to the irdma driver. The alternative would be a major rewrite to migrate the queue ops to the IOCTL ABI. Signed-off-by: David Hu --- kernel-headers/rdma/irdma-abi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel-headers/rdma/irdma-abi.h b/kernel-headers/rdma/irdma-abi.h index 36f20802b..0f623a0fc 100644 --- a/kernel-headers/rdma/irdma-abi.h +++ b/kernel-headers/rdma/irdma-abi.h @@ -73,11 +73,19 @@ struct irdma_resize_cq_req { struct irdma_create_cq_req { __aligned_u64 user_cq_buf; __aligned_u64 user_shadow_area; + __s32 cq_dmabuf_fd; + __s32 shadow_dmabuf_fd; + __u8 is_cq_dmabuf; + __u8 is_shadow_dmabuf; + __u8 rsvd[6]; }; struct irdma_create_srq_req { __aligned_u64 user_srq_buf; __aligned_u64 user_shadow_area; + __s32 srq_dmabuf_fd; + __u8 is_srq_dmabuf; + __u8 rsvd[3]; }; struct irdma_create_srq_resp { @@ -88,6 +96,9 @@ struct irdma_create_srq_resp { struct irdma_create_qp_req { __aligned_u64 user_wqe_bufs; __aligned_u64 user_compl_ctx; + __s32 sq_dmabuf_fd; + __u8 is_sq_dmabuf; + __u8 rsvd[3]; }; struct irdma_mem_reg_req { From 3b412cc3b19c5ee34e33faffe12f4edb5de46a6c Mon Sep 17 00:00:00 2001 From: David Hu Date: Sat, 27 Jun 2026 06:44:35 +0000 Subject: [PATCH 31/33] provider/irdma: Migrate CQ creation to be dmabuf aware The cleanup path is also updated to be dmabuf aware. Signed-off-by: David Hu --- providers/irdma/uverbs.c | 85 +++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index a9bb0a91e..6cb02a0c2 100644 --- a/providers/irdma/uverbs.c +++ b/providers/irdma/uverbs.c @@ -665,6 +665,8 @@ static struct ibv_cq_ex *ucreate_cq(struct ibv_context *context, struct 
ibv_cq_init_attr_ex *attr_ex, bool ext_cq) { + struct ibv_pd *pd = (attr_ex->comp_mask & IBV_CQ_INIT_ATTR_MASK_PD) ? attr_ex->parent_domain : NULL; + struct irdma_cq_uk_init_info info = {}; struct irdma_ureg_mr reg_mr_cmd = {}; struct irdma_ucreate_cq_ex cmd = {}; @@ -721,46 +723,56 @@ static struct ibv_cq_ex *ucreate_cq(struct ibv_context *context, total_size = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE; iwucq->buf_size = total_size; - info.cq_base = irdma_calloc_hw_buf(total_size); - if (!info.cq_base) - goto err_cq_base; - - reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ; - reg_mr_cmd.cq_pages = cq_pages; - - ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base, - total_size, (uintptr_t)info.cq_base, - IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr, - &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd), - &reg_mr_resp, sizeof(reg_mr_resp)); + ret = __irdma_alloc_buf(pd, total_size, IRDMA_HW_PAGE_SIZE, &iwucq->cq_buf); if (ret) { errno = ret; - goto err_dereg_mr; + goto err_cq_base; } + info.cq_base = iwucq->cq_buf.buf; - iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; + if (!iwucq->cq_buf.is_dmabuf) { + reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ; + reg_mr_cmd.cq_pages = cq_pages; - if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) { - info.shadow_area = irdma_calloc_hw_buf(IRDMA_DB_SHADOW_AREA_SIZE); - if (!info.shadow_area) + ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base, + total_size, (uintptr_t)info.cq_base, + IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr, + &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd), + &reg_mr_resp, sizeof(reg_mr_resp)); + if (ret) { + errno = ret; goto err_dereg_mr; + } - reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ; - reg_mr_shadow_cmd.cq_pages = 1; + iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; + } - ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area, - IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area, - IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area, - &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd), - &reg_mr_shadow_resp, 
sizeof(reg_mr_shadow_resp)); + if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) { + ret = __irdma_alloc_buf(pd, IRDMA_DB_SHADOW_AREA_SIZE, IRDMA_HW_PAGE_SIZE, &iwucq->shadow_buf); if (ret) { errno = ret; - goto err_dereg_shadow; + goto err_dereg_mr } + info.shadow_area = iwucq->shadow_buf.buf; + + if (!iwucq->shadow_buf.is_dmabuf) { + reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ; + reg_mr_shadow_cmd.cq_pages = 1; + + ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area, + IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area, + IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area, + &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd), + &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp)); + if (ret) { + errno = ret; + goto err_dereg_shadow; + } - iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; - + iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; + } } else { + /* Without IRDMA_FEATURE_CQ_RESIZE, the shadow area sits at the end of the main CQ buffer */ info.shadow_area = (__le64 *)((__u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT)); } @@ -769,6 +781,13 @@ static struct ibv_cq_ex *ucreate_cq(struct ibv_context *context, cmd.user_cq_buf = (__u64)((uintptr_t)info.cq_base); cmd.user_shadow_area = (__u64)((uintptr_t)info.shadow_area); + cmd.is_cq_dmabuf = iwucq->cq_buf.is_dmabuf; + cmd.cq_dmabuf_fd = iwucq->cq_buf.ibv_buf.dmabuf_fd; + if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) { + cmd.is_shadow_dmabuf = iwucq->shadow_buf.is_dmabuf; + cmd.shadow_dmabuf_fd = iwucq->shadow_buf.ibv_buf.dmabuf_fd; + } + ret = ibv_cmd_create_cq_ex(context, attr_ex, NULL, &iwucq->verbs_cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp), 0); @@ -888,12 +907,14 @@ int irdma_udestroy_cq(struct ibv_cq *cq) if (ret) goto err; - ibv_cmd_dereg_mr(&iwucq->vmr); - irdma_free_hw_buf(iwucq->cq.cq_base, iwucq->buf_size); - + if (!iwucq->cq_buf.is_dmabuf) + ibv_cmd_dereg_mr(&iwucq->vmr); + __irdma_free_buf(&iwucq->cq_buf); + if (uk_attrs->feature_flags & 
IRDMA_FEATURE_CQ_RESIZE) { - ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area); - irdma_free_hw_buf(iwucq->cq.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE); + if (!iwucq->shadow_buf.is_dmabuf) + ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area); + __irdma_free_buf(&iwucq->shadow_buf); } free(iwucq); return 0; From fbd917ce57447fd888b962c561f683db4dd3236b Mon Sep 17 00:00:00 2001 From: David Hu Date: Sat, 27 Jun 2026 07:05:27 +0000 Subject: [PATCH 32/33] provider/irdma: Migrate QP creation to be dmabuf aware Cleanup path is also updated accordingly Signed-off-by: David Hu --- providers/irdma/uverbs.c | 43 +++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index 6cb02a0c2..f979bbb4c 100644 --- a/providers/irdma/uverbs.c +++ b/providers/irdma/uverbs.c @@ -1581,7 +1581,9 @@ static int irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp) if (iwuqp->qp.push_wqe) irdma_munmap(iwuqp->qp.push_wqe); - ibv_cmd_dereg_mr(&iwuqp->vmr); + if (!iwucq->sq_buf.is_dmabuf) + ibv_cmd_dereg_mr(&iwuqp->vmr); + __irdma_free_buf(&iwucq->sq_buf); return 0; } @@ -1622,28 +1624,36 @@ static int irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd, if (pgsz > 0) os_pgsz = pgsz; } - info->sq = irdma_calloc_hw_buf_sz(totalqpsize, os_pgsz); - if (!info->sq) - return ENOMEM; + + ret = __irdma_alloc_buf(pd, totalqpsize, os_pgsz, &iwuqp->sq_buf); + if (ret) + return ret; + iwuqp->buf_size = totalqpsize; + info->sq = iwuqp->sq_buf.buf; info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE]; info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem; - reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP; - reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT; - reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT; + if (!iwuqp->sq_buf.is_dmabuf) { + reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP; + reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT; + reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT; - ret = 
ibv_cmd_reg_mr(pd, info->sq, totalqpsize, - (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE, - &iwuqp->vmr, &reg_mr_cmd.ibv_cmd, - sizeof(reg_mr_cmd), &reg_mr_resp, - sizeof(reg_mr_resp)); - if (ret) - goto err_dereg_mr; + ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize, + (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE, + &iwuqp->vmr, &reg_mr_cmd.ibv_cmd, + sizeof(reg_mr_cmd), &reg_mr_resp, + sizeof(reg_mr_resp)); + if (ret) + goto err_dereg_mr; + } cmd.user_wqe_bufs = (__u64)((uintptr_t)info->sq); cmd.user_compl_ctx = (__u64)(uintptr_t)&iwuqp->qp; + cmd.is_sq_dmabuf = iwuqp->sq_buf.is_dmabuf; + cmd.sq_dmabuf_fd = iwuqp->sq_buf.ibv_buf.dmabuf_fd; + ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(struct irdma_ucreate_qp_resp)); @@ -1667,9 +1677,10 @@ static int irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd, return 0; err_qp: - ibv_cmd_dereg_mr(&iwuqp->vmr); + if (!iwuqp->sq_buf.is_dmabuf) + ibv_cmd_dereg_mr(&iwuqp->vmr); err_dereg_mr: - irdma_free_hw_buf(info->sq, iwuqp->buf_size); + __irdma_free_buf(&iwuqp->sq_buf); return ret; } From 1dba9e5c2133190fdd47ff7e344e6f70d4745a27 Mon Sep 17 00:00:00 2001 From: David Hu Date: Sat, 27 Jun 2026 07:17:18 +0000 Subject: [PATCH 33/33] provider/irdma: Migrate SRQ creation to be dmabuf aware The cleanup path is also updated accordingly. Signed-off-by: David Hu --- providers/irdma/uverbs.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c index f979bbb4c..44227ad72 100644 --- a/providers/irdma/uverbs.c +++ b/providers/irdma/uverbs.c @@ -505,8 +505,9 @@ int irdma_udestroy_srq(struct ibv_srq *ibsrq) if (ret) return ret; - ibv_cmd_dereg_mr(&iwusrq->vmr); - irdma_free_hw_buf(iwusrq->srq.srq_base, iwusrq->buf_size); + if (!iwusrq->srq_buf.is_dmabuf) + ibv_cmd_dereg_mr(&iwusrq->vmr); + __irdma_free_buf(&iwusrq->srq_buf); free(iwusrq); return 0; err: @@ -568,30 +569,32 @@ struct ibv_srq 
*irdma_ucreate_srq(struct ibv_pd *pd, size = roundup(depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE); total_size = size + IRDMA_DB_SHADOW_AREA_SIZE; iwusrq->buf_size = total_size; - info.srq = irdma_calloc_hw_buf(total_size); - - if (!info.srq) { - ret = ENOMEM; + ret = __irdma_alloc_buf(pd, total_size, IRDMA_HW_PAGE_SIZE, &iwusrq->srq_buf); + if (ret) goto err_sges; - } + info.srq = iwusrq->srq_buf.buf; - memset(info.srq, 0, total_size); - reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_SRQ; - reg_mr_cmd.rq_pages = size >> IRDMA_HW_PAGE_SHIFT; + if (!iwusrq->srq_buf.is_dmabuf) { + reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_SRQ; + reg_mr_cmd.rq_pages = size >> IRDMA_HW_PAGE_SHIFT; - ret = ibv_cmd_reg_mr(pd, info.srq, total_size, - (uintptr_t)info.srq, IBV_ACCESS_LOCAL_WRITE, - &iwusrq->vmr, &reg_mr_cmd.ibv_cmd, - sizeof(reg_mr_cmd), &reg_mr_resp, - sizeof(reg_mr_resp)); - if (ret) - goto err_cmd_reg; + ret = ibv_cmd_reg_mr(pd, info.srq, total_size, + (uintptr_t)info.srq, IBV_ACCESS_LOCAL_WRITE, + &iwusrq->vmr, &reg_mr_cmd.ibv_cmd, + sizeof(reg_mr_cmd), &reg_mr_resp, + sizeof(reg_mr_resp)); + if (ret) + goto err_cmd_reg; - iwusrq->vmr.ibv_mr.pd = pd; + iwusrq->vmr.ibv_mr.pd = pd; + } info.shadow_area = (__le64 *)((__u8 *)info.srq + size); cmd.user_srq_buf = (__u64)((uintptr_t)info.srq); cmd.user_shadow_area = (__u64)((uintptr_t)info.shadow_area); + cmd.is_srq_dmabuf = iwusrq->srq_buf.is_dmabuf; + cmd.srq_dmabuf_fd = iwusrq->srq_buf.ibv_buf.dmabuf_fd; + ret = ibv_cmd_create_srq(pd, &iwusrq->v_srq.srq, initattr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret)