Commit 52675626 authored by zhuwenwen
Browse files

skip fp8 kernels and paged_attention_rocm

parent 87223113
...@@ -243,11 +243,11 @@ set(VLLM_EXT_SRC ...@@ -243,11 +243,11 @@ set(VLLM_EXT_SRC
"csrc/attention/attention_kernels_opt.cu" "csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu" "csrc/attention/attention_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu" "csrc/opt/layernorm_kernels_opt.cu"
"csrc/layernorm_quant_kernels.cu" # "csrc/layernorm_quant_kernels.cu"
# "csrc/quantization/gptq/q_gemm.cu" # "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu" # "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" # "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu" "csrc/prepare_inputs/advance_step.cu"
...@@ -656,4 +656,4 @@ endif() ...@@ -656,4 +656,4 @@ endif()
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/vllm_flash_attn.cmake) include(cmake/external_projects/vllm_flash_attn.cmake)
endif () endif ()
\ No newline at end of file
...@@ -4,4 +4,4 @@ ...@@ -4,4 +4,4 @@
#include "dtype_float16.cuh" #include "dtype_float16.cuh"
#include "dtype_float32.cuh" #include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh" #include "dtype_bfloat16.cuh"
// #include "dtype_fp8.cuh" #include "dtype_fp8.cuh"
...@@ -6,7 +6,9 @@ ...@@ -6,7 +6,9 @@
*/ */
#include "type_convert.cuh" #include "type_convert.cuh"
#ifndef USE_ROCM
#include "quantization/fp8/common.cuh" #include "quantization/fp8/common.cuh"
#endif
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include <torch/cuda.h> #include <torch/cuda.h>
......
...@@ -194,13 +194,13 @@ void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual, ...@@ -194,13 +194,13 @@ void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
// torch::Tensor& weight, // torch::Tensor& weight,
// torch::Tensor& scale, double epsilon); // torch::Tensor& scale, double epsilon);
void rms_norm_dynamic_per_token_quant(torch::Tensor& out, // void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
torch::Tensor const& input, // torch::Tensor const& input,
torch::Tensor const& weight, // torch::Tensor const& weight,
torch::Tensor& scales, // torch::Tensor& scales,
double const epsilon, // double const epsilon,
std::optional<torch::Tensor> scale_ub, // std::optional<torch::Tensor> scale_ub,
std::optional<torch::Tensor> residual); // std::optional<torch::Tensor> residual);
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
torch::Tensor& key, int64_t head_size, torch::Tensor& key, int64_t head_size,
......
#pragma once #pragma once
#ifndef USE_ROCM
#include <hip/hip_fp8.h> #include <hip/hip_fp8.h>
#endif
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
#include <hip/hip_bf16.h> #include <hip/hip_bf16.h>
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh" #include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead // TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/fp8/common.cuh" // #include "quantization/fp8/common.cuh"
namespace vllm { namespace vllm {
......
...@@ -319,12 +319,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -319,12 +319,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// &fused_add_rms_norm_static_fp8_quant); // &fused_add_rms_norm_static_fp8_quant);
// Fused Layernorm + Quant kernels // Fused Layernorm + Quant kernels
ops.def( // ops.def(
"rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, " // "rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
"Tensor weight, Tensor! scale, float epsilon, " // "Tensor weight, Tensor! scale, float epsilon, "
"Tensor? scale_ub, Tensor!? residual) -> ()"); // "Tensor? scale_ub, Tensor!? residual) -> ()");
ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA, // ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
&rms_norm_dynamic_per_token_quant); // &rms_norm_dynamic_per_token_quant);
// Rotary embedding // Rotary embedding
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key. // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
......
...@@ -605,10 +605,9 @@ def get_gaudi_sw_version(): ...@@ -605,10 +605,9 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
if not _is_hip(): # from setuptools_scm import get_version
from setuptools_scm import get_version # version = get_version(write_to="vllm/_version.py")
version = get_version(write_to="vllm/_version.py") # sep = "+" if "+" not in version else "." # dev versions might contain +
sep = "+" if "+" not in version else "." # dev versions might contain +
if _no_device(): if _no_device():
if envs.VLLM_TARGET_DEVICE == "empty": if envs.VLLM_TARGET_DEVICE == "empty":
......
...@@ -428,30 +428,30 @@ def paged_attention_v2_opt_tc_with_mask( ...@@ -428,30 +428,30 @@ def paged_attention_v2_opt_tc_with_mask(
attn_masks, attn_masks_stride) attn_masks, attn_masks_stride)
def paged_attention_rocm( # def paged_attention_rocm(
out: torch.Tensor, # out: torch.Tensor,
exp_sum: torch.Tensor, # exp_sum: torch.Tensor,
max_logits: torch.Tensor, # max_logits: torch.Tensor,
tmp_out: torch.Tensor, # tmp_out: torch.Tensor,
query: torch.Tensor, # query: torch.Tensor,
key_cache: torch.Tensor, # key_cache: torch.Tensor,
value_cache: torch.Tensor, # value_cache: torch.Tensor,
num_kv_heads: int, # num_kv_heads: int,
scale: float, # scale: float,
block_tables: torch.Tensor, # block_tables: torch.Tensor,
seq_lens: torch.Tensor, # seq_lens: torch.Tensor,
block_size: int, # block_size: int,
max_seq_len: int, # max_seq_len: int,
alibi_slopes: Optional[torch.Tensor], # alibi_slopes: Optional[torch.Tensor],
kv_cache_dtype: str, # kv_cache_dtype: str,
k_scale: torch.Tensor, # k_scale: torch.Tensor,
v_scale: torch.Tensor, # v_scale: torch.Tensor,
) -> None: # ) -> None:
torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query, # torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
key_cache, value_cache, num_kv_heads, # key_cache, value_cache, num_kv_heads,
scale, block_tables, seq_lens, # scale, block_tables, seq_lens,
block_size, max_seq_len, alibi_slopes, # block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale) # kv_cache_dtype, k_scale, v_scale)
# pos encoding ops # pos encoding ops
......
...@@ -55,7 +55,7 @@ if TYPE_CHECKING: ...@@ -55,7 +55,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_VIDEO_FETCH_TIMEOUT: int = 30
...@@ -742,4 +742,4 @@ def compute_hash() -> str: ...@@ -742,4 +742,4 @@ def compute_hash() -> str:
hash_str = hashlib.md5(str(factors).encode()).hexdigest() hash_str = hashlib.md5(str(factors).encode()).hexdigest()
return hash_str return hash_str
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment