Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31aec25b
Commit
31aec25b
authored
Apr 18, 2026
by
zhuwenwen
Browse files
[Feature] Support vllm 0.19.0
parent
2a69949b
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
137 additions
and
27 deletions
+137
-27
CMakeLists.txt
CMakeLists.txt
+2
-2
cmake/utils.cmake
cmake/utils.cmake
+2
-1
csrc/cuda_vec_utils.cuh
csrc/cuda_vec_utils.cuh
+2
-0
csrc/fused_qknorm_rope_kernel.cu
csrc/fused_qknorm_rope_kernel.cu
+9
-9
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/q_gemm.cu
+2
-0
requirements/rocm.txt
requirements/rocm.txt
+7
-2
setup.py
setup.py
+109
-9
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+4
-4
No files found.
CMakeLists.txt
View file @
31aec25b
...
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
...
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set
(
PYTHON_SUPPORTED_VERSIONS
"3.10"
"3.11"
"3.12"
"3.13"
)
set
(
PYTHON_SUPPORTED_VERSIONS
"3.10"
"3.11"
"3.12"
"3.13"
)
# Supported AMD GPU architectures.
# Supported AMD GPU architectures.
set
(
HIP_SUPPORTED_ARCHS
"gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201"
)
set
(
HIP_SUPPORTED_ARCHS
"gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201
;gfx928;gfx936;gfx938
"
)
# ROCm installation prefix. Default to /opt/rocm but allow override via
# ROCm installation prefix. Default to /opt/rocm but allow override via
# -DROCM_PATH=/your/rocm/path when invoking cmake.
# -DROCM_PATH=/your/rocm/path when invoking cmake.
...
@@ -1202,7 +1202,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
...
@@ -1202,7 +1202,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
endif
()
endif
()
# For CUDA and HIP builds also build the triton_kernels external package.
# For CUDA and HIP builds also build the triton_kernels external package.
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
OR VLLM_GPU_LANG STREQUAL
"HIP"
)
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
include
(
cmake/external_projects/triton_kernels.cmake
)
include
(
cmake/external_projects/triton_kernels.cmake
)
endif
()
endif
()
...
...
cmake/utils.cmake
View file @
31aec25b
...
@@ -106,7 +106,8 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
...
@@ -106,7 +106,8 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_OPERATORS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_HALF_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
"-D__CUDA_NO_HALF2_OPERATORS__"
)
"-D__CUDA_NO_HALF2_OPERATORS__"
"--gpu-max-threads-per-block=1024"
)
endif
()
endif
()
elseif
(
${
GPU_LANG
}
STREQUAL
"HIP"
)
elseif
(
${
GPU_LANG
}
STREQUAL
"HIP"
)
...
...
csrc/cuda_vec_utils.cuh
View file @
31aec25b
...
@@ -8,6 +8,8 @@
...
@@ -8,6 +8,8 @@
#include <cassert>
#include <cassert>
#ifdef USE_ROCM
#ifdef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#else
#else
#include <cuda_bf16.h>
#include <cuda_bf16.h>
...
...
csrc/fused_qknorm_rope_kernel.cu
View file @
31aec25b
...
@@ -38,15 +38,15 @@
...
@@ -38,15 +38,15 @@
#ifdef USE_ROCM
#ifdef USE_ROCM
#define FINAL_MASK 0xffffffffffffffffULL
#define FINAL_MASK 0xffffffffffffffffULL
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
//
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
//
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
// implementation is copy/pasted from the implementation in ROCm 7.0
//
// implementation is copy/pasted from the implementation in ROCm 7.0
__device__
inline
void
__syncwarp
()
{
//
__device__ inline void __syncwarp() {
__builtin_amdgcn_fence
(
__ATOMIC_RELEASE
,
"wavefront"
);
//
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
__builtin_amdgcn_wave_barrier
();
//
__builtin_amdgcn_wave_barrier();
__builtin_amdgcn_fence
(
__ATOMIC_ACQUIRE
,
"wavefront"
);
//
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}
//
}
#endif
//
#endif
#else
#else
#define FINAL_MASK 0xffffffff
#define FINAL_MASK 0xffffffff
#endif
#endif
...
...
csrc/quantization/gptq/q_gemm.cu
View file @
31aec25b
...
@@ -12,7 +12,9 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
...
@@ -12,7 +12,9 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_fp16.h>
#ifndef USE_ROCM
#include "compat.cuh"
#include "compat.cuh"
#endif
#include "matrix_view.cuh"
#include "matrix_view.cuh"
#include "qdq_2.cuh"
#include "qdq_2.cuh"
#include "qdq_3.cuh"
#include "qdq_3.cuh"
...
...
requirements/rocm.txt
View file @
31aec25b
...
@@ -16,8 +16,13 @@ packaging>=24.2
...
@@ -16,8 +16,13 @@ packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
setuptools-scm>=8
runai-model-streamer[s3,gcs,azure]==0.15.7
runai-model-streamer[s3,gcs,azure]==0.15.7
conch-triton-kernels==1.2.1
#
conch-triton-kernels==1.2.1
timm>=1.0.17
timm>=1.0.17
# amd-quark: required for Quark quantization on ROCm
# amd-quark: required for Quark quantization on ROCm
# To be consistent with test_quark.py
# To be consistent with test_quark.py
amd-quark>=0.8.99
amd-quark>=0.8.99
\ No newline at end of file
# Other necessary dependencies
torch == 2.10.0
torchvision == 0.25.0
flash_attn == 2.8.3
setup.py
View file @
31aec25b
...
@@ -22,6 +22,16 @@ from setuptools_scm import get_version
...
@@ -22,6 +22,16 @@ from setuptools_scm import get_version
from
torch.utils.cpp_extension
import
CUDA_HOME
,
ROCM_HOME
from
torch.utils.cpp_extension
import
CUDA_HOME
,
ROCM_HOME
from
typing
import
Optional
,
Union
import
subprocess
from
pathlib
import
Path
pwd
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
add_git_version
=
False
if
int
(
os
.
environ
.
get
(
'ADD_GIT_VERSION'
,
'0'
))
==
1
:
add_git_version
=
True
def
load_module_from_path
(
module_name
,
path
):
def
load_module_from_path
(
module_name
,
path
):
spec
=
importlib
.
util
.
spec_from_file_location
(
module_name
,
path
)
spec
=
importlib
.
util
.
spec_from_file_location
(
module_name
,
path
)
module
=
importlib
.
util
.
module_from_spec
(
spec
)
module
=
importlib
.
util
.
module_from_spec
(
spec
)
...
@@ -365,7 +375,7 @@ class cmake_build_ext(build_ext):
...
@@ -365,7 +375,7 @@ class cmake_build_ext(build_ext):
os
.
makedirs
(
os
.
path
.
dirname
(
dst_file
),
exist_ok
=
True
)
os
.
makedirs
(
os
.
path
.
dirname
(
dst_file
),
exist_ok
=
True
)
self
.
copy_file
(
file
,
dst_file
)
self
.
copy_file
(
file
,
dst_file
)
if
_is_cuda
()
or
_is_hip
()
:
if
_is_cuda
():
# copy vllm/third_party/triton_kernels/**/*.py from self.build_lib
# copy vllm/third_party/triton_kernels/**/*.py from self.build_lib
# to current directory so that they can be included in the editable
# to current directory so that they can be included in the editable
# build
# build
...
@@ -866,6 +876,94 @@ def get_nvcc_cuda_version() -> Version:
...
@@ -866,6 +876,94 @@ def get_nvcc_cuda_version() -> Version:
return
nvcc_cuda_version
return
nvcc_cuda_version
def get_sha(root: Union[str, Path]) -> str:
    """Return the full commit hash of ``HEAD`` for the git checkout at *root*.

    This is a best-effort lookup: it never raises for the usual failure
    modes and returns the sentinel string ``'Unknown'`` instead.

    Args:
        root: Directory in which ``git rev-parse HEAD`` is executed.

    Returns:
        The commit hash printed by git with surrounding whitespace removed,
        or ``'Unknown'`` when git is unavailable, *root* does not exist or is
        not inside a repository, or the output cannot be decoded.
    """
    try:
        raw = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD'],
            cwd=root,
            # Keep git's "fatal: not a git repository" message out of the
            # build log; the failure is reported through the return value.
            stderr=subprocess.DEVNULL,
        )
        return raw.decode('ascii').strip()
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # OSError: git binary or cwd missing; SubprocessError: non-zero exit.
        return 'Unknown'
def get_version_add(sha: Optional[str] = None) -> str:
    """Regenerate ``vllm/version.py`` with the DAS/DTK local version label.

    The local label is built as follows:

    * ``das`` by default;
    * ``das.<first 7 chars of sha>`` when the module-level
      ``add_git_version`` flag is set and a commit hash is available
      (*sha* is looked up with ``get_sha`` when it is ``None``);
    * ``.dtk<digits>`` is appended when the ``ROCM_PATH`` environment
      variable is set, using the first line of
      ``$ROCM_PATH/.info/rocm_version`` with the dots removed.

    Side effects: appends this checkout to git's global ``safe.directory``
    list and overwrites ``vllm/version.py`` next to this file.

    Args:
        sha: Commit hash to embed. ``None`` means "look it up"; the sentinel
            ``'Unknown'`` means "no hash available".

    Returns:
        The version string written to ``__hcu_version__``
        (``0.19.0+<label>``).

    Raises:
        OSError: If ``ROCM_PATH`` is set but its ``rocm_version`` file cannot
            be read, or if ``vllm/version.py`` cannot be written.
    """
    vllm_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(vllm_root, "vllm", "version.py")

    # Pass argv as a list (no shell) so a checkout path containing spaces or
    # shell metacharacters is taken literally.
    try:
        subprocess.run(
            ["git", "config", "--global", "--add", "safe.directory",
             vllm_root],
            capture_output=False,
            text=True,
        )
    except OSError:
        # git is not installed; the shell-based invocation did not abort the
        # build in that case either, so stay best-effort.
        pass

    # Always bind a label so no sha/add_git_version combination can leave
    # `version` undefined or produce "das.Unknown".
    version = 'das'
    if add_git_version and sha != 'Unknown':
        if sha is None:
            sha = get_sha(vllm_root)
        if sha and sha != 'Unknown':
            version = 'das.' + sha[:7]

    # dtk version
    rocm_path = os.getenv("ROCM_PATH")
    if rocm_path:
        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
        with open(rocm_version_path, 'r', encoding='utf-8') as file:
            # strip(): the line terminator must not end up inside the quoted
            # version literal of the generated module.
            rocm_version = file.readline().strip().replace(".", "")
        if rocm_version:
            version += ".dtk" + rocm_version

    hcu_version = f"0.19.0+{version}"

    new_version_content = f"""
try:
    __version__ = "0.19.0"
    __version_tuple__ = (0, 19, 0)
    __hcu_version__ = f'{hcu_version}'
    from vllm.version import __version__, __version_tuple__, __hcu_version__

except Exception as e:
    import warnings

    warnings.warn("Failed to read commit hash:\\n" + str(e),
                  RuntimeWarning,
                  stacklevel=2)

    __version__ = "dev"
    __version_tuple__ = (0, 0, __version__)


def _prev_minor_version_was(version_str):
    '''Check whether a given version matches the previous minor version.

    Return True if version_str matches the previous minor version.

    For example - return True if the current version if 0.7.4 and the
    supplied version_str is '0.6'.

    Used for --show-hidden-metrics-for-version.
    '''
    # Match anything if this is a dev tree
    if __version_tuple__[0:2] == (0, 0):
        return True

    # Note - this won't do the right thing when we release 1.0!
    # assert __version_tuple__[0] == 0
    assert isinstance(__version_tuple__[1], int)
    return version_str == f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"


def _prev_minor_version():
    '''For the purpose of testing, return a previous minor version number.'''
    # In dev tree, this will return "0.-1", but that will work fine"
    assert isinstance(__version_tuple__[1], int)
    return f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
"""

    # The context manager closes the file; no explicit close() is needed.
    with open(add_version_path, encoding="utf-8", mode="w") as file:
        file.write(new_version_content)

    return hcu_version
def get_version(**scm_kwargs):
    """Return the HCU version string, regenerating ``vllm/version.py`` first.

    This function has the same name as the ``get_version`` imported from
    ``setuptools_scm`` at the top of this file, and other code in this file
    still calls ``get_version(write_to="vllm/_version.py")``. To keep those
    call sites working, any call that passes keyword arguments is forwarded
    unchanged to ``setuptools_scm.get_version``.

    Args:
        **scm_kwargs: When non-empty, forwarded to
            ``setuptools_scm.get_version`` and its result is returned.

    Returns:
        With no arguments, the value of ``__hcu_version__`` defined by the
        freshly generated ``vllm/version.py``; otherwise whatever
        ``setuptools_scm.get_version`` returns.
    """
    if scm_kwargs:
        # Local import under an alias: the module-level name now refers to
        # this function.
        from setuptools_scm import get_version as _scm_get_version
        return _scm_get_version(**scm_kwargs)

    get_version_add()

    # Anchor on this file's directory (the same location get_version_add
    # writes to) rather than on the current working directory.
    version_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'vllm', 'version.py')

    # Execute into an explicit namespace. Since Python 3.13 (PEP 667), names
    # bound by exec() inside a function are no longer visible through a later
    # locals() call, so the result has to be read from our own dict.
    namespace: dict = {}
    with open(version_file, encoding='utf-8') as f:
        # The file was generated by get_version_add() just above.
        exec(compile(f.read(), version_file, 'exec'), namespace)
    return namespace['__hcu_version__']
def
get_vllm_version
()
->
str
:
def
get_vllm_version
()
->
str
:
# Allow overriding the version. This is useful to build platform-specific
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.
# wheels (e.g. CPU, TPU) without modifying the source.
...
@@ -874,8 +972,9 @@ def get_vllm_version() -> str:
...
@@ -874,8 +972,9 @@ def get_vllm_version() -> str:
os
.
environ
[
"SETUPTOOLS_SCM_PRETEND_VERSION"
]
=
env_version
os
.
environ
[
"SETUPTOOLS_SCM_PRETEND_VERSION"
]
=
env_version
return
get_version
(
write_to
=
"vllm/_version.py"
)
return
get_version
(
write_to
=
"vllm/_version.py"
)
version
=
get_version
(
write_to
=
"vllm/_version.py"
)
if
not
_is_hip
():
sep
=
"+"
if
"+"
not
in
version
else
"."
# dev versions might contain +
version
=
get_version
(
write_to
=
"vllm/_version.py"
)
sep
=
"+"
if
"+"
not
in
version
else
"."
# dev versions might contain +
if
_no_device
():
if
_no_device
():
if
envs
.
VLLM_TARGET_DEVICE
==
"empty"
:
if
envs
.
VLLM_TARGET_DEVICE
==
"empty"
:
...
@@ -892,9 +991,10 @@ def get_vllm_version() -> str:
...
@@ -892,9 +991,10 @@ def get_vllm_version() -> str:
version
+=
f
"
{
sep
}
cu
{
cuda_version_str
}
"
version
+=
f
"
{
sep
}
cu
{
cuda_version_str
}
"
elif
_is_hip
():
elif
_is_hip
():
# Get the Rocm Version
# Get the Rocm Version
rocm_version
=
get_rocm_version
()
or
torch
.
version
.
hip
# rocm_version = get_rocm_version() or torch.version.hip
if
rocm_version
and
rocm_version
!=
envs
.
VLLM_MAIN_CUDA_VERSION
:
# if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
version
+=
f
"
{
sep
}
rocm
{
rocm_version
.
replace
(
'.'
,
''
)[:
3
]
}
"
# version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
version
=
get_version
()
elif
_is_tpu
():
elif
_is_tpu
():
version
+=
f
"
{
sep
}
tpu"
version
+=
f
"
{
sep
}
tpu"
elif
_is_cpu
():
elif
_is_cpu
():
...
@@ -960,10 +1060,10 @@ if _is_cuda() or _is_hip():
...
@@ -960,10 +1060,10 @@ if _is_cuda() or _is_hip():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.cumem_allocator"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.cumem_allocator"
))
# Optional since this doesn't get built (produce an .so file). This is just
# Optional since this doesn't get built (produce an .so file). This is just
# copying the relevant .py files from the source repository.
# copying the relevant .py files from the source repository.
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.triton_kernels"
,
optional
=
True
))
#
ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))
if
_is_hip
():
#
if _is_hip():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._rocm_C"
))
#
ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if
_is_cuda
():
if
_is_cuda
():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.vllm_flash_attn._vllm_fa2_C"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.vllm_flash_attn._vllm_fa2_C"
))
...
...
vllm/platforms/rocm.py
View file @
31aec25b
...
@@ -43,10 +43,10 @@ except ImportError as e:
...
@@ -43,10 +43,10 @@ except ImportError as e:
logger
.
warning
(
"Failed to import from vllm._C with %r"
,
e
)
logger
.
warning
(
"Failed to import from vllm._C with %r"
,
e
)
# import custom ops, trigger op registration
# import custom ops, trigger op registration
try
:
#
try:
import
vllm._rocm_C
# noqa: F401
#
import vllm._rocm_C # noqa: F401
except
ImportError
as
e
:
#
except ImportError as e:
logger
.
warning
(
"Failed to import from vllm._rocm_C with %r"
,
e
)
#
logger.warning("Failed to import from vllm._rocm_C with %r", e)
# Models not supported by ROCm.
# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS
:
list
[
str
]
=
[]
_ROCM_UNSUPPORTED_MODELS
:
list
[
str
]
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment