Commit 02c3e313 authored by zhuwenwen
Browse files

fix interface error

parent 4d3a2c28
......@@ -119,9 +119,6 @@ void paged_attention_v1_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
case 160:
LAUNCH_PAGED_ATTENTION_V2(160);
break;
case 192:
LAUNCH_PAGED_ATTENTION_V1(192);
break;
......
......@@ -125,9 +125,6 @@ void paged_attention_v2_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
case 160:
LAUNCH_PAGED_ATTENTION_V2(160);
break;
case 192:
LAUNCH_PAGED_ATTENTION_V2(192);
break;
......
......@@ -51,9 +51,6 @@
} else if (HEADDIM == 128) { \
constexpr static int HEAD_SIZE = 128; \
return __VA_ARGS__(); \
} else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \
......
......@@ -43,9 +43,6 @@
} else if (HEADDIM == 128) { \
constexpr static int HEAD_SIZE = 128; \
return __VA_ARGS__(); \
} else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \
......
......@@ -160,7 +160,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
if (experts_num_exceed_limit) {
// set dynamic shared mem
auto kernel = vllm::moe_align_block_size_kernel<scalar_t, true>;
auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, true>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
......@@ -175,7 +175,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
topk_ids.numel());
} else {
// set dynamic shared mem
auto kernel = vllm::moe_align_block_size_kernel<scalar_t, false>;
auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, false>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<1, num_experts, shared_mem, stream>>>(
......
......@@ -123,6 +123,8 @@ __global__ void act_and_mul_kernel_with_param(
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
out[token_idx * d + idx] = ACT_FN(x, param) * y;
}
}
} // namespace vllm
......
......@@ -35,7 +35,7 @@ class PagedAttention:
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [64, 80, 96, 112, 120, 128, 160, 192, 256]
return [64, 80, 96, 112, 120, 128, 192, 256]
@staticmethod
def get_kv_cache_shape(
......
......@@ -475,7 +475,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_TREE_DECODING":
lambda:
(os.environ.get("VLLM_TREE_DECODING", "0").strip().lower() in
("1", "true"))
("1", "true")),
# By default, vLLM will check the peer-to-peer capability itself,
# in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
# If this env var is set to 1, vLLM will skip the peer-to-peer check,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment