adapt to vllm-plugin-FL

0c698cda · caihl · aadf7b41
Commit 0c698cda authored Mar 05, 2026 by caihl
5 changed files
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -90,7 +90,7 @@ void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                          reinterpret_cast<half*>(out.data_ptr()), out.numel());
      break;
    }
-#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) || defined(USE_ROCM))
    case at::ScalarType::BFloat16: {
      fa->allreduce<nv_bfloat16>(
          stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),

--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -105,7 +105,7 @@ DINLINE half& assign_add(half& a, half b) {
 }
 DINLINE float& assign_add(float& a, float b) { return a += b; }
-#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) || defined(USE_ROCM))
 DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
 template <>
 DINLINE nv_bfloat16 downcast_s(float val) {

--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -41,11 +41,11 @@
  #if defined(HIP_VERSION) && HIP_VERSION < 70000000
 // On ROCm versions before 7.0, __syncwarp isn't defined. The below
 // implementation is copy/pasted from the implementation in ROCm 7.0
-__device__ inline void __syncwarp() {
+//__device__ inline void __syncwarp() {
-  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
+//  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
-  __builtin_amdgcn_wave_barrier();
+//  __builtin_amdgcn_wave_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
+//  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
-}
+//}
  #endif
 #else
  #define FINAL_MASK 0xffffffff

--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -25,6 +25,8 @@ elif current_platform.is_rocm():
            "Rocm platform requires upstream flash-attn "
            "to be installed. Please install flash-attn first."
        ) from e
+else:
+    from flash_attn import flash_attn_varlen_func
 def get_flash_attn_version(requires_alibi: bool = False) -> int | None:

--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -143,15 +143,15 @@ class CudaCommunicator(DeviceCommunicatorBase):
            out = qr_comm.quick_all_reduce(input_)
            assert out is not None
            return out
-        ca_comm = self.ca_comm
+        #ca_comm = self.ca_comm
-        if (
+        #if (
-            ca_comm is not None
+        #    ca_comm is not None
-            and not ca_comm.disabled
+        #    and not ca_comm.disabled
-            and ca_comm.should_custom_ar(input_)
+        #    and ca_comm.should_custom_ar(input_)
-        ):
+        #):
-            out = ca_comm.custom_all_reduce(input_)
+        #    out = ca_comm.custom_all_reduce(input_)
-            assert out is not None
+        #    assert out is not None
-            return out
+        #    return out
        symm_mem_comm = self.symm_mem_comm
        if symm_mem_comm is not None and symm_mem_comm.should_use_symm_mem(input_):
            out = symm_mem_comm.all_reduce(input_)