Commit 02c3e313 authored by zhuwenwen
Browse files

fix interface error

parent 4d3a2c28
...@@ -119,9 +119,6 @@ void paged_attention_v1_launcher( ...@@ -119,9 +119,6 @@ void paged_attention_v1_launcher(
case 128: case 128:
LAUNCH_PAGED_ATTENTION_V1(128); LAUNCH_PAGED_ATTENTION_V1(128);
break; break;
case 160:
LAUNCH_PAGED_ATTENTION_V2(160);
break;
case 192: case 192:
LAUNCH_PAGED_ATTENTION_V1(192); LAUNCH_PAGED_ATTENTION_V1(192);
break; break;
......
...@@ -125,9 +125,6 @@ void paged_attention_v2_launcher( ...@@ -125,9 +125,6 @@ void paged_attention_v2_launcher(
case 128: case 128:
LAUNCH_PAGED_ATTENTION_V2(128); LAUNCH_PAGED_ATTENTION_V2(128);
break; break;
case 160:
LAUNCH_PAGED_ATTENTION_V2(160);
break;
case 192: case 192:
LAUNCH_PAGED_ATTENTION_V2(192); LAUNCH_PAGED_ATTENTION_V2(192);
break; break;
......
...@@ -51,9 +51,6 @@ ...@@ -51,9 +51,6 @@
} else if (HEADDIM == 128) { \ } else if (HEADDIM == 128) { \
constexpr static int HEAD_SIZE = 128; \ constexpr static int HEAD_SIZE = 128; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
} else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \ } else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \ constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
......
...@@ -43,9 +43,6 @@ ...@@ -43,9 +43,6 @@
} else if (HEADDIM == 128) { \ } else if (HEADDIM == 128) { \
constexpr static int HEAD_SIZE = 128; \ constexpr static int HEAD_SIZE = 128; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
} else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \ } else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \ constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
......
...@@ -160,7 +160,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, ...@@ -160,7 +160,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
if (experts_num_exceed_limit) { if (experts_num_exceed_limit) {
// set dynamic shared mem // set dynamic shared mem
auto kernel = vllm::moe_align_block_size_kernel<scalar_t, true>; auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, true>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem)); (void*)kernel, shared_mem));
...@@ -175,7 +175,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, ...@@ -175,7 +175,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
topk_ids.numel()); topk_ids.numel());
} else { } else {
// set dynamic shared mem // set dynamic shared mem
auto kernel = vllm::moe_align_block_size_kernel<scalar_t, false>; auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, false>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem)); (void*)kernel, shared_mem));
kernel<<<1, num_experts, shared_mem, stream>>>( kernel<<<1, num_experts, shared_mem, stream>>>(
......
...@@ -123,6 +123,8 @@ __global__ void act_and_mul_kernel_with_param( ...@@ -123,6 +123,8 @@ __global__ void act_and_mul_kernel_with_param(
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
out[token_idx * d + idx] = ACT_FN(x, param) * y; out[token_idx * d + idx] = ACT_FN(x, param) * y;
} }
}
} // namespace vllm } // namespace vllm
......
...@@ -35,7 +35,7 @@ class PagedAttention: ...@@ -35,7 +35,7 @@ class PagedAttention:
@staticmethod @staticmethod
def get_supported_head_sizes() -> List[int]: def get_supported_head_sizes() -> List[int]:
return [64, 80, 96, 112, 120, 128, 160, 192, 256] return [64, 80, 96, 112, 120, 128, 192, 256]
@staticmethod @staticmethod
def get_kv_cache_shape( def get_kv_cache_shape(
......
...@@ -475,7 +475,7 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -475,7 +475,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_TREE_DECODING": "VLLM_TREE_DECODING":
lambda: lambda:
(os.environ.get("VLLM_TREE_DECODING", "0").strip().lower() in (os.environ.get("VLLM_TREE_DECODING", "0").strip().lower() in
("1", "true")) ("1", "true")),
# By default, vLLM will check the peer-to-peer capability itself, # By default, vLLM will check the peer-to-peer capability itself,
# in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
# If this env var is set to 1, vLLM will skip the peer-to-peer check, # If this env var is set to 1, vLLM will skip the peer-to-peer check,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment