Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
52675626
Commit
52675626
authored
Mar 31, 2025
by
zhuwenwen
Browse files
skip fp8 kernels and paged_attention_rocm
parent
87223113
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
51 additions
and
48 deletions
+51
-48
CMakeLists.txt
CMakeLists.txt
+3
-3
csrc/attention/attention_dtypes.h
csrc/attention/attention_dtypes.h
+1
-1
csrc/layernorm_quant_kernels.cu
csrc/layernorm_quant_kernels.cu
+2
-0
csrc/ops.h
csrc/ops.h
+7
-7
csrc/quantization/fp8/amd/quant_utils.cuh
csrc/quantization/fp8/amd/quant_utils.cuh
+2
-0
csrc/quantization/fused_kernels/quant_conversions.cuh
csrc/quantization/fused_kernels/quant_conversions.cuh
+1
-1
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+6
-6
setup.py
setup.py
+3
-4
vllm/_custom_ops.py
vllm/_custom_ops.py
+24
-24
vllm/envs.py
vllm/envs.py
+2
-2
No files found.
CMakeLists.txt
View file @
52675626
...
...
@@ -243,11 +243,11 @@ set(VLLM_EXT_SRC
"csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu"
"csrc/layernorm_quant_kernels.cu"
#
"csrc/layernorm_quant_kernels.cu"
# "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
#
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
...
...
@@ -656,4 +656,4 @@ endif()
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
include
(
cmake/external_projects/flashmla.cmake
)
include
(
cmake/external_projects/vllm_flash_attn.cmake
)
endif
()
endif
()
\ No newline at end of file
csrc/attention/attention_dtypes.h
View file @
52675626
...
...
@@ -4,4 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
//
#include "dtype_fp8.cuh"
#include "dtype_fp8.cuh"
csrc/layernorm_quant_kernels.cu
View file @
52675626
...
...
@@ -6,7 +6,9 @@
*/
#include "type_convert.cuh"
#ifndef USE_ROCM
#include "quantization/fp8/common.cuh"
#endif
#include "dispatch_utils.h"
#include <torch/cuda.h>
...
...
csrc/ops.h
View file @
52675626
...
...
@@ -194,13 +194,13 @@ void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
// torch::Tensor& weight,
// torch::Tensor& scale, double epsilon);
void
rms_norm_dynamic_per_token_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
const
&
weight
,
torch
::
Tensor
&
scales
,
double
const
epsilon
,
std
::
optional
<
torch
::
Tensor
>
scale_ub
,
std
::
optional
<
torch
::
Tensor
>
residual
);
//
void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
//
torch::Tensor const& input,
//
torch::Tensor const& weight,
//
torch::Tensor& scales,
//
double const epsilon,
//
std::optional<torch::Tensor> scale_ub,
//
std::optional<torch::Tensor> residual);
void
rotary_embedding
(
torch
::
Tensor
&
positions
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key
,
int64_t
head_size
,
...
...
csrc/quantization/fp8/amd/quant_utils.cuh
View file @
52675626
#pragma once
#ifndef USE_ROCM
#include <hip/hip_fp8.h>
#endif
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
...
...
csrc/quantization/fused_kernels/quant_conversions.cuh
View file @
52675626
...
...
@@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/fp8/common.cuh"
//
#include "quantization/fp8/common.cuh"
namespace
vllm
{
...
...
csrc/torch_bindings.cpp
View file @
52675626
...
...
@@ -319,12 +319,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// &fused_add_rms_norm_static_fp8_quant);
// Fused Layernorm + Quant kernels
ops
.
def
(
"rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
"Tensor weight, Tensor! scale, float epsilon, "
"Tensor? scale_ub, Tensor!? residual) -> ()"
);
ops
.
impl
(
"rms_norm_dynamic_per_token_quant"
,
torch
::
kCUDA
,
&
rms_norm_dynamic_per_token_quant
);
//
ops.def(
//
"rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
//
"Tensor weight, Tensor! scale, float epsilon, "
//
"Tensor? scale_ub, Tensor!? residual) -> ()");
//
ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
//
&rms_norm_dynamic_per_token_quant);
// Rotary embedding
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
...
...
setup.py
View file @
52675626
...
...
@@ -605,10 +605,9 @@ def get_gaudi_sw_version():
def
get_vllm_version
()
->
str
:
if
not
_is_hip
():
from
setuptools_scm
import
get_version
version
=
get_version
(
write_to
=
"vllm/_version.py"
)
sep
=
"+"
if
"+"
not
in
version
else
"."
# dev versions might contain +
# from setuptools_scm import get_version
# version = get_version(write_to="vllm/_version.py")
# sep = "+" if "+" not in version else "." # dev versions might contain +
if
_no_device
():
if
envs
.
VLLM_TARGET_DEVICE
==
"empty"
:
...
...
vllm/_custom_ops.py
View file @
52675626
...
...
@@ -428,30 +428,30 @@ def paged_attention_v2_opt_tc_with_mask(
attn_masks
,
attn_masks_stride
)
def
paged_attention_rocm
(
out
:
torch
.
Tensor
,
exp_sum
:
torch
.
Tensor
,
max_logits
:
torch
.
Tensor
,
tmp_out
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
key_cache
:
torch
.
Tensor
,
value_cache
:
torch
.
Tensor
,
num_kv_heads
:
int
,
scale
:
float
,
block_tables
:
torch
.
Tensor
,
seq_lens
:
torch
.
Tensor
,
block_size
:
int
,
max_seq_len
:
int
,
alibi_slopes
:
Optional
[
torch
.
Tensor
],
kv_cache_dtype
:
str
,
k_scale
:
torch
.
Tensor
,
v_scale
:
torch
.
Tensor
,
)
->
None
:
torch
.
ops
.
_rocm_C
.
paged_attention
(
out
,
exp_sum
,
max_logits
,
tmp_out
,
query
,
key_cache
,
value_cache
,
num_kv_heads
,
scale
,
block_tables
,
seq_lens
,
block_size
,
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
k_scale
,
v_scale
)
#
def paged_attention_rocm(
#
out: torch.Tensor,
#
exp_sum: torch.Tensor,
#
max_logits: torch.Tensor,
#
tmp_out: torch.Tensor,
#
query: torch.Tensor,
#
key_cache: torch.Tensor,
#
value_cache: torch.Tensor,
#
num_kv_heads: int,
#
scale: float,
#
block_tables: torch.Tensor,
#
seq_lens: torch.Tensor,
#
block_size: int,
#
max_seq_len: int,
#
alibi_slopes: Optional[torch.Tensor],
#
kv_cache_dtype: str,
#
k_scale: torch.Tensor,
#
v_scale: torch.Tensor,
#
) -> None:
#
torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
#
key_cache, value_cache, num_kv_heads,
#
scale, block_tables, seq_lens,
#
block_size, max_seq_len, alibi_slopes,
#
kv_cache_dtype, k_scale, v_scale)
# pos encoding ops
...
...
vllm/envs.py
View file @
52675626
...
...
@@ -55,7 +55,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL
:
bool
=
True
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
fork
"
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
spawn
"
VLLM_ASSETS_CACHE
:
str
=
os
.
path
.
join
(
VLLM_CACHE_ROOT
,
"assets"
)
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_VIDEO_FETCH_TIMEOUT
:
int
=
30
...
...
@@ -742,4 +742,4 @@ def compute_hash() -> str:
hash_str
=
hashlib
.
md5
(
str
(
factors
).
encode
()).
hexdigest
()
return
hash_str
return
hash_str
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment