[Feature][Core] Support Fabric detection to adapt the MNNVL protocol for the GB series (#33540)

Signed-off-by: Kebe <mail@kebe7jun.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Thomas Vegas <tvegas@nvidia.com> Co-authored-by: youkaichao <youkaichao@gmail.com>

[Feature][Core] Support Fabric detection to adapt the MNNVL protocol for the GB series (#33540)
Signed-off-by: Kebe <mail@kebe7jun.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: Thomas Vegas <tvegas@nvidia.com> Co-authored-by: youkaichao <youkaichao@gmail.com>
528e9b14 · Kebe · GitHub · d95b4be4 · 528e9b14
Unverified Commit 528e9b14 authored Feb 02, 2026 by Kebe Committed by GitHub Feb 02, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 1 deletion

csrc/cumem_allocator.cpp csrc/cumem_allocator.cpp +18 -1

No files found.
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
  if (flag) {  // support GPUDirect RDMA if possible
    prop.allocFlags.gpuDirectRDMACapable = 1;
  }
+  int fab_flag = 0;
+  CUDA_CHECK(cuDeviceGetAttribute(
+      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+  if (fab_flag) {  // support fabric handle if possible
+    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+  }
 #endif

 #ifndef USE_ROCM
  // Allocate memory using cuMemCreate
-  CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+  CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0);
+  if (ret) {
+    if (fab_flag &&
+        (ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) {
+      // Fabric allocation may fail without multi-node nvlink,
+      // fallback to POSIX file descriptor
+      prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+      CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
+    } else {
+      CUDA_CHECK(ret);
+    }
+  }
  if (error_code != 0) {
    return;
  }