[Feature] support vllm 0.18.1

df4818c6 · zhuwenwen · a26e8dc7 · df4818c6 · df4818c6 · df4818c6
Commit df4818c6 authored Apr 01, 2026 by zhuwenwen
7 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201;gfx926;gfx928;gfx936")

 # ROCm installation prefix. Default to /opt/rocm but allow override via
 # -DROCM_PATH=/your/rocm/path when invoking cmake.
@@ -1185,7 +1185,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()

 # For CUDA and HIP builds also build the triton_kernels external package.
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+if(VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/triton_kernels.cmake)
 endif()


--- a/csrc/cuda_vec_utils.cuh
+++ b/csrc/cuda_vec_utils.cuh
@@ -8,6 +8,8 @@
 #include <cassert>

 #ifdef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
  #include <hip/hip_runtime.h>
 #else
  #include <cuda_bf16.h>

--- a/csrc/fused_qknorm_rope_kernel.cu
+++ b/csrc/fused_qknorm_rope_kernel.cu
@@ -38,15 +38,15 @@
 #ifdef USE_ROCM
  #define FINAL_MASK 0xffffffffffffffffULL

-  #if defined(HIP_VERSION) && HIP_VERSION < 70000000
-// On ROCm versions before 7.0, __syncwarp isn't defined. The below
-// implementation is copy/pasted from the implementation in ROCm 7.0
-__device__ inline void __syncwarp() {
-  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
-  __builtin_amdgcn_wave_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
-}
-  #endif
+//   #if defined(HIP_VERSION) && HIP_VERSION < 70000000
+// // On ROCm versions before 7.0, __syncwarp isn't defined. The below
+// // implementation is copy/pasted from the implementation in ROCm 7.0
+// __device__ inline void __syncwarp() {
+//   __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
+//   __builtin_amdgcn_wave_barrier();
+//   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
+// }
+//   #endif
 #else
  #define FINAL_MASK 0xffffffff
 #endif

--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -12,7 +12,9 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>

+#ifndef USE_ROCM
 #include "compat.cuh"
+#endif
 #include "matrix_view.cuh"
 #include "qdq_2.cuh"
 #include "qdq_3.cuh"

--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -16,8 +16,13 @@ packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
 runai-model-streamer[s3,gcs,azure]==0.15.7
-conch-triton-kernels==1.2.1
+# conch-triton-kernels==1.2.1
 timm>=1.0.17
 # amd-quark: required for Quark quantization on ROCm 
 # To be consistent with test_quark.py
 amd-quark>=0.8.99
+
+# Other necessary dependencies
+torch == 2.10.0
+triton == 3.5.1
+flash_attn == 2.8.3
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,16 @@ from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME


+from typing import Optional, Union
+import subprocess
+from pathlib import Path
+
+# Absolute path of the directory that contains this setup.py.
+pwd = os.path.dirname(os.path.abspath(__file__))
+# True only when the ADD_GIT_VERSION environment variable parses to 1.
+add_git_version = int(os.environ.get('ADD_GIT_VERSION', '0')) == 1
+
+
 def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
@@ -298,7 +308,7 @@ class cmake_build_ext(build_ext):
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            self.copy_file(file, dst_file)

-        if _is_cuda() or _is_hip():
+        if _is_cuda():
            # copy vllm/third_party/triton_kernels/**/*.py from self.build_lib
            # to current directory so that they can be included in the editable
            # build
@@ -798,6 +808,94 @@ def get_nvcc_cuda_version() -> Version:
    return nvcc_cuda_version


+def get_sha(root: Union[str, Path]) -> str:
+    """Return the HEAD commit hash of the git checkout at ``root``.
+
+    Any failure while running or decoding ``git rev-parse HEAD`` yields
+    the string 'Unknown' instead of raising.
+    """
+    rev_parse = ['git', 'rev-parse', 'HEAD']
+    try:
+        raw_output = subprocess.check_output(rev_parse, cwd=root)
+        commit = raw_output.decode('ascii').strip()
+    except Exception:
+        commit = 'Unknown'
+    return commit
+
+
+def get_version_add(sha: Optional[str] = None) -> str:
+    """Regenerate vllm/version.py and return the HCU version string.
+
+    The local suffix is 'das' (plus the first 7 characters of the commit SHA
+    when ADD_GIT_VERSION=1), followed by '.dtk<version>' if ROCM_PATH is set.
+    """
+    try:  # best effort; an argument list (no shell) keeps odd paths intact
+        subprocess.run(["git", "config", "--global", "--add", "safe.directory", pwd])
+    except OSError:
+        pass
+    add_version_path = os.path.join(pwd, "vllm", "version.py")
+    version = 'das'  # fallback, also used when the caller passes sha='Unknown'
+    if add_git_version and sha != 'Unknown':
+        if sha is None:
+            sha = get_sha(pwd)
+        version = 'das.' + sha[:7]
+
+    # dtk version
+    rocm_path = os.getenv("ROCM_PATH")
+    if rocm_path:
+        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
+        with open(rocm_version_path, 'r', encoding='utf-8') as file:
+            # strip() drops the newline that would break the generated literal
+            version += ".dtk" + file.readline().strip().replace(".", "")
+
+    new_version_content = f"""
+try:
+    __version__ = "0.18.1"
+    __version_tuple__ = (0, 18, 1)
+    __hcu_version__ = f'0.18.1+{version}' 
+    
+    from vllm.version import __version__, __version_tuple__, __hcu_version__
+except Exception as e:
+    import warnings
+
+    warnings.warn(f"Failed to read commit hash:\\n{{e}}",
+                  RuntimeWarning,
+                  stacklevel=2)
+    __version__ = "dev"
+    __version_tuple__ = (0, 0, __version__)
+    
+    
+def _prev_minor_version_was(version_str):
+    '''Check whether a given version matches the previous minor version.
+
+    Return True if version_str matches the previous minor version.
+
+    For example - return True if the current version if 0.7.4 and the
+    supplied version_str is '0.6'.
+
+    Used for --show-hidden-metrics-for-version.
+    '''
+    # Match anything if this is a dev tree
+    if __version_tuple__[0:2] == (0, 0):
+        return True
+
+    # Note - this won't do the right thing when we release 1.0!
+    # assert __version_tuple__[0] == 0
+    assert isinstance(__version_tuple__[1], int)
+    return version_str == f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
+
+
+def _prev_minor_version():
+    '''For the purpose of testing, return a previous minor version number.'''
+    # In dev tree, this will return "0.-1", but that will work fine"
+    assert isinstance(__version_tuple__[1], int)
+    return f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
+"""
+    with open(add_version_path, encoding="utf-8",mode="w") as file:
+        file.write(new_version_content)
+    return f'0.18.1+{version}'
+    
+    
+_scm_get_version = get_version  # keep setuptools_scm's get_version reachable
+
+
+def get_version(*args, **kwargs):
+    """Return the HCU version, or defer to setuptools_scm when given arguments.
+
+    This def reuses the name imported from setuptools_scm, which
+    get_vllm_version() still calls as get_version(write_to=...); calls that
+    carry arguments are forwarded to that original function unchanged.
+    With no arguments, vllm/version.py is regenerated and its
+    ``__hcu_version__`` value is returned.
+    """
+    if args or kwargs:
+        return _scm_get_version(*args, **kwargs)
+    get_version_add()
+    version_file = 'vllm/version.py'
+    namespace = {}
+    with open(version_file, encoding='utf-8') as f:
+        # Explicit namespace: on Python 3.13+ locals() no longer exposes
+        # names bound by exec() inside a function.
+        exec(compile(f.read(), version_file, 'exec'), namespace)
+    return namespace['__hcu_version__']
+
+
 def get_vllm_version() -> str:
    # Allow overriding the version. This is useful to build platform-specific
    # wheels (e.g. CPU, TPU) without modifying the source.
@@ -806,6 +904,7 @@ def get_vllm_version() -> str:
        os.environ["SETUPTOOLS_SCM_PRETEND_VERSION"] = env_version
        return get_version(write_to="vllm/_version.py")

+    if not _is_hip():
        version = get_version(write_to="vllm/_version.py")
        sep = "+" if "+" not in version else "."  # dev versions might contain +

@@ -824,9 +923,10 @@ def get_vllm_version() -> str:
                    version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
        # Get the Rocm Version
-        rocm_version = get_rocm_version() or torch.version.hip
-        if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
-            version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
+        # rocm_version = get_rocm_version() or torch.version.hip
+        # if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
+        #     version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
+        version = get_version()
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@@ -892,10 +992,10 @@ if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
    # Optional since this doesn't get built (produce an .so file). This is just
    # copying the relevant .py files from the source repository.
-    ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))
+    # ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))

-if _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+# if _is_hip():
+#     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))

 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -42,10 +42,10 @@ except ImportError as e:
    logger.warning("Failed to import from vllm._C with %r", e)

 # import custom ops, trigger op registration
-try:
-    import vllm._rocm_C  # noqa: F401
-except ImportError as e:
-    logger.warning("Failed to import from vllm._rocm_C with %r", e)
+# try:
+#     import vllm._rocm_C  # noqa: F401
+# except ImportError as e:
+#     logger.warning("Failed to import from vllm._rocm_C with %r", e)

 # Models not supported by ROCm.
 _ROCM_UNSUPPORTED_MODELS: list[str] = []