Commit 52675626 authored by zhuwenwen
Browse files

skip fp8 kernels and paged_attention_rocm

parent 87223113
......@@ -243,11 +243,11 @@ set(VLLM_EXT_SRC
"csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu"
"csrc/layernorm_quant_kernels.cu"
# "csrc/layernorm_quant_kernels.cu"
# "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
# "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
......@@ -656,4 +656,4 @@ endif()
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()
endif ()
\ No newline at end of file
......@@ -4,4 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
// #include "dtype_fp8.cuh"
#include "dtype_fp8.cuh"
......@@ -6,7 +6,9 @@
*/
#include "type_convert.cuh"
#ifndef USE_ROCM
#include "quantization/fp8/common.cuh"
#endif
#include "dispatch_utils.h"
#include <torch/cuda.h>
......
......@@ -194,13 +194,13 @@ void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
// torch::Tensor& weight,
// torch::Tensor& scale, double epsilon);
void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
torch::Tensor const& input,
torch::Tensor const& weight,
torch::Tensor& scales,
double const epsilon,
std::optional<torch::Tensor> scale_ub,
std::optional<torch::Tensor> residual);
// void rms_norm_dynamic_per_token_quant(torch::Tensor& out,
// torch::Tensor const& input,
// torch::Tensor const& weight,
// torch::Tensor& scales,
// double const epsilon,
// std::optional<torch::Tensor> scale_ub,
// std::optional<torch::Tensor> residual);
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
torch::Tensor& key, int64_t head_size,
......
#pragma once
#ifndef USE_ROCM
#include <hip/hip_fp8.h>
#endif
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
......
......@@ -6,7 +6,7 @@
#include "quantization/vectorization.cuh"
// TODO(luka/varun):refactor common.cuh to use this file instead
#include "quantization/fp8/common.cuh"
// #include "quantization/fp8/common.cuh"
namespace vllm {
......
......@@ -319,12 +319,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// &fused_add_rms_norm_static_fp8_quant);
// Fused Layernorm + Quant kernels
ops.def(
"rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
"Tensor weight, Tensor! scale, float epsilon, "
"Tensor? scale_ub, Tensor!? residual) -> ()");
ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
&rms_norm_dynamic_per_token_quant);
// ops.def(
// "rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
// "Tensor weight, Tensor! scale, float epsilon, "
// "Tensor? scale_ub, Tensor!? residual) -> ()");
// ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
// &rms_norm_dynamic_per_token_quant);
// Rotary embedding
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
......
......@@ -605,10 +605,9 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str:
if not _is_hip():
from setuptools_scm import get_version
version = get_version(write_to="vllm/_version.py")
sep = "+" if "+" not in version else "." # dev versions might contain +
# from setuptools_scm import get_version
# version = get_version(write_to="vllm/_version.py")
# sep = "+" if "+" not in version else "." # dev versions might contain +
if _no_device():
if envs.VLLM_TARGET_DEVICE == "empty":
......
......@@ -428,30 +428,30 @@ def paged_attention_v2_opt_tc_with_mask(
attn_masks, attn_masks_stride)
def paged_attention_rocm(
out: torch.Tensor,
exp_sum: torch.Tensor,
max_logits: torch.Tensor,
tmp_out: torch.Tensor,
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
num_kv_heads: int,
scale: float,
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
block_size: int,
max_seq_len: int,
alibi_slopes: Optional[torch.Tensor],
kv_cache_dtype: str,
k_scale: torch.Tensor,
v_scale: torch.Tensor,
) -> None:
torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
key_cache, value_cache, num_kv_heads,
scale, block_tables, seq_lens,
block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale)
# def paged_attention_rocm(
# out: torch.Tensor,
# exp_sum: torch.Tensor,
# max_logits: torch.Tensor,
# tmp_out: torch.Tensor,
# query: torch.Tensor,
# key_cache: torch.Tensor,
# value_cache: torch.Tensor,
# num_kv_heads: int,
# scale: float,
# block_tables: torch.Tensor,
# seq_lens: torch.Tensor,
# block_size: int,
# max_seq_len: int,
# alibi_slopes: Optional[torch.Tensor],
# kv_cache_dtype: str,
# k_scale: torch.Tensor,
# v_scale: torch.Tensor,
# ) -> None:
# torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query,
# key_cache, value_cache, num_kv_heads,
# scale, block_tables, seq_lens,
# block_size, max_seq_len, alibi_slopes,
# kv_cache_dtype, k_scale, v_scale)
# pos encoding ops
......
......@@ -55,7 +55,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
......@@ -742,4 +742,4 @@ def compute_hash() -> str:
hash_str = hashlib.md5(str(factors).encode()).hexdigest()
return hash_str
return hash_str
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment