"vscode:/vscode.git/clone" did not exist on "d6a518fdde9780f5c9aabe8cf1f2fafd29af3cbc"
Commit 48a9e546 authored by 王敏's avatar 王敏
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

parents 6372a1f3 c11b09df
...@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): ...@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert torch.equal(actual, expected) assert torch.equal(actual, expected)
@pytest.mark.skipif(current_platform.is_rocm(),
reason="ROCM is not supported.")
@pytest.mark.parametrize("batch_size", list(range(1, 257))) @pytest.mark.parametrize("batch_size", list(range(1, 257)))
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) @pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
......
...@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module(): ...@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
trust_remote_code=True, trust_remote_code=True,
enable_chunked_prefill=False, enable_chunked_prefill=False,
) )
assert model_runner.attn_backend.__name__ == "TritonMLABackend" assert model_runner.attn_backend.__name__ == "FlashMLABackend" # "TritonMLABackend"
@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) @pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
......
...@@ -1971,7 +1971,8 @@ def wvSplitKQ(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype, ...@@ -1971,7 +1971,8 @@ def wvSplitKQ(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype,
# moe # moe
def moe_sum(input: torch.Tensor, output: torch.Tensor): def moe_sum(input: torch.Tensor, output: torch.Tensor):
torch.ops._moe_C.moe_sum(input, output) torch.ops._moe_C.moe_sum(input, output)
def moe_sum_opt1(input: torch.Tensor, output: torch.Tensor):
torch.ops._moe_C.moe_sum_opt1(input, output)
def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
block_size: int, sorted_token_ids: torch.Tensor, block_size: int, sorted_token_ids: torch.Tensor,
......
...@@ -215,6 +215,9 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -215,6 +215,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
inject_kv_into_layer(kv_cache_layer, kv_cache, inject_kv_into_layer(kv_cache_layer, kv_cache,
request.slot_mapping, request.request_id) request.slot_mapping, request.request_id)
tensor = self.p2p_nccl_engine.recv_store.pop(request.request_id + "#" + layer_name, None)
if tensor is not None:
del tensor
def wait_for_layer_load(self, layer_name: str) -> None: def wait_for_layer_load(self, layer_name: str) -> None:
"""Blocking until the KV for a specific layer is loaded into vLLM's """Blocking until the KV for a specific layer is loaded into vLLM's
......
...@@ -1004,7 +1004,7 @@ class EngineArgs: ...@@ -1004,7 +1004,7 @@ class EngineArgs:
enable_sleep_mode=self.enable_sleep_mode, enable_sleep_mode=self.enable_sleep_mode,
model_impl=self.model_impl, model_impl=self.model_impl,
override_attention_dtype=self.override_attention_dtype, override_attention_dtype=self.override_attention_dtype,
enable_chunked_prefill=self.enable_chunked_prefill enable_chunked_prefill=self.enable_chunked_prefill,
) )
def create_load_config(self) -> LoadConfig: def create_load_config(self) -> LoadConfig:
......
...@@ -50,6 +50,7 @@ def get_config_quant_dtype( ...@@ -50,6 +50,7 @@ def get_config_quant_dtype(
use_int8_w8a8: bool, use_int8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool, use_int4_w4a16: bool,
use_int4_w4a8: bool,
) -> Optional[torch.dtype]: ) -> Optional[torch.dtype]:
if use_fp8_w8a8: if use_fp8_w8a8:
return torch.float8_e4m3fn return torch.float8_e4m3fn
...@@ -126,6 +127,7 @@ class FusedMoEQuantConfig: ...@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8: bool = False, use_int8_w8a8: bool = False,
use_int8_w8a16: bool = False, use_int8_w8a16: bool = False,
use_int4_w4a16: bool = False, use_int4_w4a16: bool = False,
use_int4_w4a8: bool = False,
per_act_token_quant: bool = False, per_act_token_quant: bool = False,
per_out_ch_quant: bool = False, per_out_ch_quant: bool = False,
block_shape: Optional[list[int]] = None, block_shape: Optional[list[int]] = None,
...@@ -136,6 +138,7 @@ class FusedMoEQuantConfig: ...@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8, use_int8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_int4_w4a16, use_int4_w4a16,
use_int4_w4a8,
] ]
]) <= 1, "Quantization flags are mutually exclusive." ]) <= 1, "Quantization flags are mutually exclusive."
...@@ -144,6 +147,7 @@ class FusedMoEQuantConfig: ...@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8=use_int8_w8a8, use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16, use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16, use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8=use_int4_w4a8,
) )
return FusedMoEQuantConfig( return FusedMoEQuantConfig(
quant_dtype, quant_dtype,
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
...@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor, ...@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
B_scale: Optional[torch.Tensor], B_scale: Optional[torch.Tensor],
B_zp: Optional[torch.Tensor], B_zp: Optional[torch.Tensor],
topk_weights: Optional[torch.Tensor], topk_weights: Optional[torch.Tensor],
topk_ids: torch.Tensor,
sorted_token_ids: torch.Tensor, sorted_token_ids: torch.Tensor,
expert_ids: torch.Tensor, expert_ids: torch.Tensor,
num_tokens_post_padded: torch.Tensor, num_tokens_post_padded: torch.Tensor,
...@@ -1603,7 +1602,8 @@ def fused_experts_impl( ...@@ -1603,7 +1602,8 @@ def fused_experts_impl(
qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8, qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a8=use_int8_w8a8, use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16, use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16) use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8=use_int4_w4a8)
get_config_func = functools.partial( get_config_func = functools.partial(
try_get_optimal_moe_config, try_get_optimal_moe_config,
...@@ -1689,7 +1689,6 @@ def fused_experts_impl( ...@@ -1689,7 +1689,6 @@ def fused_experts_impl(
w1_scale, w1_scale,
w1_zp, w1_zp,
curr_topk_weights, curr_topk_weights,
curr_topk_ids,
sorted_token_ids, sorted_token_ids,
expert_ids, expert_ids,
num_tokens_post_padded, num_tokens_post_padded,
...@@ -1729,7 +1728,6 @@ def fused_experts_impl( ...@@ -1729,7 +1728,6 @@ def fused_experts_impl(
w2_scale, w2_scale,
w2_zp, w2_zp,
curr_topk_weights, curr_topk_weights,
curr_topk_ids,
sorted_token_ids, sorted_token_ids,
expert_ids, expert_ids,
num_tokens_post_padded, num_tokens_post_padded,
...@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8: bool = False, use_int8_w8a8: bool = False,
use_int8_w8a16: bool = False, use_int8_w8a16: bool = False,
use_int4_w4a16: bool = False, use_int4_w4a16: bool = False,
use_int4_w4a8: bool =False, use_int4_w4a8: bool = False,
per_act_token_quant: bool = False, per_act_token_quant: bool = False,
block_shape: Optional[List[int]] = None, block_shape: Optional[List[int]] = None,
): ):
...@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
self.use_int4_w4a16 = use_int4_w4a16 self.use_int4_w4a16 = use_int4_w4a16
self.use_int8_w8a8 = use_int8_w8a8 self.use_int8_w8a8 = use_int8_w8a8
self.use_int8_w8a16 = use_int8_w8a16 self.use_int8_w8a16 = use_int8_w8a16
self.use_int4_w4a8= use_int4_w4a8 self.use_int4_w4a8 = use_int4_w4a8
@property @property
def activation_formats( def activation_formats(
...@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ...@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8=self.use_int8_w8a8, use_int8_w8a8=self.use_int8_w8a8,
use_int8_w8a16=self.use_int8_w8a16, use_int8_w8a16=self.use_int8_w8a16,
use_int4_w4a16=self.use_int4_w4a16, use_int4_w4a16=self.use_int4_w4a16,
use_int4_w4a8= self.use_int4_w4a8, use_int4_w4a8=self.use_int4_w4a8,
per_channel_quant=self.per_act_token_quant, per_channel_quant=self.per_act_token_quant,
block_shape=self.block_shape) block_shape=self.block_shape)
...@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe( ...@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
use_int8_w8a8: bool, use_int8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_int4_w4a16: bool, use_int4_w4a16: bool,
use_int4_w4a8:bool, use_int4_w4a8: bool,
per_act_token_quant: bool, per_act_token_quant: bool,
block_shape: Optional[List[int]] = None, block_shape: Optional[List[int]] = None,
) -> mk.FusedMoEModularKernel: ) -> mk.FusedMoEModularKernel:
...@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe( ...@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
use_int8_w8a8=use_int8_w8a8, use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16, use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16, use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8= use_int4_w4a8, use_int4_w4a8=use_int4_w4a8,
per_act_token_quant=per_act_token_quant, per_act_token_quant=per_act_token_quant,
block_shape=block_shape, block_shape=block_shape,
), ),
......
...@@ -795,7 +795,6 @@ class FusedMoE(torch.nn.Module): ...@@ -795,7 +795,6 @@ class FusedMoE(torch.nn.Module):
if quant_config is None: if quant_config is None:
# Not considering quant for now, temporarily # Not considering quant for now, temporarily
self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1 self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
# self.use_nn_moe = os.environ.get('MOE_NN') == '1'
else: else:
self.use_nn_moe = False self.use_nn_moe = False
......
...@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum): ...@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
@cache @cache
def is_rocm_aiter_moe_enabled() -> bool: def is_rocm_aiter_moe_enabled() -> bool:
return current_platform.is_rocm() \ return False
and envs.VLLM_ROCM_USE_AITER_MOE \ # return current_platform.is_rocm() \
and envs.VLLM_ROCM_USE_AITER # and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
def rocm_aiter_asm_moe_tkw1_impl( def rocm_aiter_asm_moe_tkw1_impl(
......
...@@ -25,6 +25,7 @@ class TBOModelInputSplit(): ...@@ -25,6 +25,7 @@ class TBOModelInputSplit():
self.req_num_right = 0 self.req_num_right = 0
self.scheduler_output_left = None self.scheduler_output_left = None
self.scheduler_output_right = None self.scheduler_output_right = None
self.query_start_loc_right = None
input_split = TBOModelInputSplit() input_split = TBOModelInputSplit()
...@@ -136,78 +137,39 @@ def prepare_tbo_atten_metadata( ...@@ -136,78 +137,39 @@ def prepare_tbo_atten_metadata(
assert num_reqs > 0 assert num_reqs > 0
seq_len_offset = req_offset seq_len_offset = req_offset
if req_offset == 0: #left
query_start_offset = 0
else:
query_start_offset = req_offset + 1
# Get the number of scheduled tokens for each request. # Get the number of scheduled tokens for each request.
tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
num_scheduled_tokens = np.array(tokens, dtype=np.int32) num_scheduled_tokens = np.array(tokens, dtype=np.int32)
max_num_scheduled_tokens = max(tokens) max_num_scheduled_tokens = max(tokens)
# Get request indices. if req_offset > 0: #right
# E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] if input_split.query_start_loc_right == None:
req_indices = np.repeat(runner.arange_np[:num_reqs], # TODO: create when system init
num_scheduled_tokens) + req_offset input_split.query_start_loc_right = torch.zeros(runner.max_num_reqs + 1,
dtype=torch.int32,
device=runner.device)
# cu_num_tokens: [2, 5, 3] -> [2, 7, 10] cu_num_tokens, arange = runner._get_cumsum_and_arange(
# arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] num_scheduled_tokens)
cu_num_tokens, arange = runner._get_cumsum_and_arange(
num_scheduled_tokens)
# Get positions. # Prepare the attention metadata.
positions_np = runner.positions_np[:total_num_scheduled_tokens] runner.query_start_loc_np[0] = 0
np.add(runner.input_batch.num_computed_tokens_cpu[req_indices], runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
arange,
out=positions_np)
# Calculate the slot mapping for each KV cache group.
for kv_cache_group_id, kv_cache_group_spec in enumerate( input_split.query_start_loc_right[0: num_reqs + 1].copy_(
runner.kv_cache_config.kv_cache_groups): runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
block_size = kv_cache_group_spec.kv_cache_spec.block_size # Note: pad query_start_loc to be non-decreasing, as kernels
block_table: BlockTable = runner.input_batch.block_table[ # like FlashAttention requires that
kv_cache_group_id] input_split.query_start_loc_right[num_reqs + 1:].fill_(
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
block_table_indices = (
req_indices * block_table.max_num_blocks_per_req +
positions_np // block_size)
block_table_cpu = block_table.get_cpu_tensor()
block_numbers = block_table_cpu.flatten(
)[block_table_indices].numpy()
block_offsets = positions_np % block_size
np.add(
block_numbers * block_size,
block_offsets,
out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
# Prepare the attention metadata.
runner.query_start_loc_np[0] = 0
runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
runner.seq_lens_np[:num_reqs] = (
runner.input_batch.num_computed_tokens_cpu[req_offset : req_offset + num_reqs] +
num_scheduled_tokens)
runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1].copy_(
runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
if req_offset > 0: #right
runner.query_start_loc[query_start_offset + num_reqs + 1:].fill_(
runner.query_start_loc_cpu[num_reqs].item()) runner.query_start_loc_cpu[num_reqs].item())
runner.seq_lens[seq_len_offset :seq_len_offset + num_reqs].copy_(runner.seq_lens_cpu[:num_reqs], query_start_loc = input_split.query_start_loc_right[: num_reqs + 1]
non_blocking=True)
# Fill unused with -1. Needed for reshape_and_cache
if req_offset > 0: #right
runner.seq_lens[seq_len_offset + num_reqs:].fill_(0)
query_start_loc = runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1] else:
query_start_loc = runner.query_start_loc[:num_reqs + 1]
seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs] seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs]
common_attn_metadata = CommonAttentionMetadata( common_attn_metadata = CommonAttentionMetadata(
...@@ -240,6 +202,9 @@ def prepare_tbo_atten_metadata( ...@@ -240,6 +202,9 @@ def prepare_tbo_atten_metadata(
origin_slot_mapping = metadata_builder.block_table.slot_mapping origin_slot_mapping = metadata_builder.block_table.slot_mapping
metadata_builder.block_table.slot_mapping = \ metadata_builder.block_table.slot_mapping = \
origin_slot_mapping[input_split.scheduler_output_left.total_num_scheduled_tokens:] origin_slot_mapping[input_split.scheduler_output_left.total_num_scheduled_tokens:]
origin_slot_map_cpu = metadata_builder.block_table.slot_mapping_cpu
metadata_builder.block_table.slot_mapping_cpu = \
origin_slot_map_cpu[input_split.scheduler_output_left.total_num_scheduled_tokens:]
if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only
_num_decodes_record = metadata_builder._num_decodes _num_decodes_record = metadata_builder._num_decodes
_num_prefills_record = metadata_builder._num_prefills _num_prefills_record = metadata_builder._num_prefills
...@@ -257,6 +222,7 @@ def prepare_tbo_atten_metadata( ...@@ -257,6 +222,7 @@ def prepare_tbo_atten_metadata(
if req_offset > 0: if req_offset > 0:
metadata_builder.block_table.block_table = origin_block_table metadata_builder.block_table.block_table = origin_block_table
metadata_builder.block_table.slot_mapping = origin_slot_mapping metadata_builder.block_table.slot_mapping = origin_slot_mapping
metadata_builder.block_table.slot_mapping_cpu = origin_slot_map_cpu
if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only
metadata_builder._num_decodes = _num_decodes_record metadata_builder._num_decodes = _num_decodes_record
...@@ -304,18 +270,16 @@ def tbo_split_and_execute_model( ...@@ -304,18 +270,16 @@ def tbo_split_and_execute_model(
inputs_embeds, inputs_embeds,
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
skip_cuda_graphs: bool = True,
) -> Union[ModelRunnerOutput, IntermediateTensors]: ) -> Union[ModelRunnerOutput, IntermediateTensors]:
use_tbo = False use_tbo = False
if isinstance(runner.attn_metadata_builders[0], MLACommonMetadataBuilder) and \ if isinstance(runner.attn_metadata_builders[0], MLACommonMetadataBuilder) and \
runner.attn_metadata_builders[0]._num_decodes > 0: #is mla decode runner.attn_metadata_builders[0]._num_decodes > 0: #is mla decode
use_tbo = False use_tbo = False
else: else:
if len(scheduler_output.num_scheduled_tokens) > 1: if len(scheduler_output.num_scheduled_tokens) > 1 and num_input_tokens > envs.VLLM_TBO_MIN_TOKENS:
split_scheduler_output(runner, scheduler_output) split_scheduler_output(runner, scheduler_output)
if input_split.scheduler_output_left.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS and \ use_tbo = True
input_split.scheduler_output_right.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS:
use_tbo = True
if use_tbo: if use_tbo:
num_input_tokens_left = input_split.scheduler_output_left.total_num_scheduled_tokens num_input_tokens_left = input_split.scheduler_output_left.total_num_scheduled_tokens
num_input_tokens_right = num_input_tokens - num_input_tokens_left num_input_tokens_right = num_input_tokens - num_input_tokens_left
...@@ -338,11 +302,12 @@ def tbo_split_and_execute_model( ...@@ -338,11 +302,12 @@ def tbo_split_and_execute_model(
else: else:
# Run the decoder. # Run the decoder.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
envs.VLLM_ENABLE_TBO = False
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
runner.vllm_config, runner.vllm_config,
num_tokens=num_input_tokens, num_tokens=num_input_tokens,
num_tokens_across_dp=num_tokens_across_dp, num_tokens_across_dp=num_tokens_across_dp,
skip_cuda_graphs=True): skip_cuda_graphs=skip_cuda_graphs):
runner.maybe_setup_kv_connector(scheduler_output) runner.maybe_setup_kv_connector(scheduler_output)
model_output = runner.model( model_output = runner.model(
...@@ -355,4 +320,5 @@ def tbo_split_and_execute_model( ...@@ -355,4 +320,5 @@ def tbo_split_and_execute_model(
runner.maybe_wait_for_kv_save() runner.maybe_wait_for_kv_save()
finished_sending, finished_recving = ( finished_sending, finished_recving = (
runner.get_finished_kv_transfers(scheduler_output)) runner.get_finished_kv_transfers(scheduler_output))
envs.VLLM_ENABLE_TBO = True
return model_output, finished_sending, finished_recving return model_output, finished_sending, finished_recving
\ No newline at end of file
...@@ -38,6 +38,7 @@ class CachedRequestState: ...@@ -38,6 +38,7 @@ class CachedRequestState:
block_ids: tuple[list[int], ...] block_ids: tuple[list[int], ...]
num_computed_tokens: int num_computed_tokens: int
output_token_ids: list[int] output_token_ids: list[int]
spec_token_ids: list[int] = None
mrope_positions: Optional[torch.Tensor] = None mrope_positions: Optional[torch.Tensor] = None
mrope_position_delta: Optional[int] = None mrope_position_delta: Optional[int] = None
...@@ -288,9 +289,16 @@ class InputBatch: ...@@ -288,9 +289,16 @@ class InputBatch:
end_idx = start_idx + len(request.output_token_ids) end_idx = start_idx + len(request.output_token_ids)
self.token_ids_cpu[req_index, self.token_ids_cpu[req_index,
start_idx:end_idx] = request.output_token_ids start_idx:end_idx] = request.output_token_ids
num_spec_tokens = 0
if request.spec_token_ids != None:
num_spec_tokens = len(request.spec_token_ids)
self.token_ids_cpu[req_index,
end_idx:end_idx + num_spec_tokens] = request.spec_token_ids
# Number of token ids in token_ids_cpu. # Number of token ids in token_ids_cpu.
# NOTE(woosuk): This may include spec decode tokens. # NOTE(woosuk): This may include spec decode tokens.
self.num_tokens[req_index] = request.num_tokens self.num_tokens[req_index] = request.num_tokens + num_spec_tokens
# Number of tokens without spec decode tokens. # Number of tokens without spec decode tokens.
self.num_tokens_no_spec[req_index] = request.num_tokens self.num_tokens_no_spec[req_index] = request.num_tokens
......
...@@ -482,6 +482,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -482,6 +482,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Update the cached states. # Update the cached states.
req_state.num_computed_tokens = num_computed_tokens req_state.num_computed_tokens = num_computed_tokens
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if not is_last_rank: if not is_last_rank:
# When using PP, the scheduler sends the sampled tokens back, # When using PP, the scheduler sends the sampled tokens back,
...@@ -498,6 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -498,6 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
elif num_new_tokens > 0: elif num_new_tokens > 0:
req_state.output_token_ids.extend( req_state.output_token_ids.extend(
new_token_ids[-num_new_tokens:]) new_token_ids[-num_new_tokens:])
if len(spec_token_ids) > 0:
req_state.spec_token_ids = spec_token_ids
# Update the block IDs. # Update the block IDs.
if not resumed_from_preemption: if not resumed_from_preemption:
...@@ -537,8 +541,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -537,8 +541,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.input_batch.num_tokens[req_index] = end_token_index self.input_batch.num_tokens[req_index] = end_token_index
# Add spec_token_ids to token_ids_cpu. # Add spec_token_ids to token_ids_cpu.
spec_token_ids = (
scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
if spec_token_ids: if spec_token_ids:
num_spec_tokens = len(spec_token_ids) num_spec_tokens = len(spec_token_ids)
start_index = self.input_batch.num_tokens_no_spec[req_index] start_index = self.input_batch.num_tokens_no_spec[req_index]
...@@ -635,7 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -635,7 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# where M is the max_model_len. # where M is the max_model_len.
token_indices = (positions_np + token_indices = (positions_np +
req_indices * self.input_batch.token_ids_cpu.shape[1]) req_indices * self.input_batch.token_ids_cpu.shape[1])
# NOTE(woosuk): We use torch.index_select instead of np.take here # NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large # because torch.index_select is much faster than np.take for large
# tensors. # tensors.
...@@ -1381,7 +1383,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1381,7 +1383,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
model_output, finished_sending, finished_recving = \ model_output, finished_sending, finished_recving = \
tbo_split_and_execute_model(self, attn_metadata, num_input_tokens, tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
num_tokens_across_dp, input_ids, positions, num_tokens_across_dp, input_ids, positions,
inputs_embeds, scheduler_output, intermediate_tensors) inputs_embeds, scheduler_output, intermediate_tensors,
skip_cuda_graphs)
else: else:
# Run the model. # Run the model.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
...@@ -2096,8 +2099,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -2096,8 +2099,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
hidden_states = outputs hidden_states = outputs
if self.speculative_config and self.speculative_config.use_eagle() and not is_profile: if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
assert isinstance(self.drafter, EagleProposer) #assert isinstance(self.drafter, EagleProposer)
self.drafter.dummy_run(num_tokens, attn_metadata) if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
self.drafter.dummy_run(num_tokens, attn_metadata)
# This is necessary to avoid blocking DP. # This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real # For dummy runs, we typically skip EPLB since we don't have any real
...@@ -2231,8 +2235,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -2231,8 +2235,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def profile_run(self) -> None: def profile_run(self) -> None:
# set profiling flag to avoid torch compile # set profiling flag to avoid torch compile
set_profilling(True) #set_profilling(True)
self._sync_device() #self._sync_device()
# Profile with multimodal encoder & encoder cache. # Profile with multimodal encoder & encoder cache.
# TODO: handle encoder-decoder models once we support them. # TODO: handle encoder-decoder models once we support them.
...@@ -2317,7 +2321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -2317,7 +2321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
del hidden_states, output del hidden_states, output
self.encoder_cache.clear() self.encoder_cache.clear()
gc.collect() gc.collect()
set_profilling(False) #set_profilling(False)
def capture_model(self) -> None: def capture_model(self) -> None:
if not self.use_cuda_graph: if not self.use_cuda_graph:
...@@ -2685,10 +2689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -2685,10 +2689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
if self.speculative_config and self.speculative_config.use_eagle(): if self.speculative_config and self.speculative_config.use_eagle():
assert isinstance(self.drafter, EagleProposer) #assert isinstance(self.drafter, EagleProposer)
# validate all draft model layers belong to the same kv cache # validate all draft model layers belong to the same kv cache
# group # group
self.drafter.validate_same_kv_cache_group(kv_cache_config) if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
self.drafter.validate_same_kv_cache_group(kv_cache_config)
if has_kv_transfer_group(): if has_kv_transfer_group():
get_kv_transfer_group().register_kv_caches(kv_caches) get_kv_transfer_group().register_kv_caches(kv_caches)
......
...@@ -80,6 +80,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler, ...@@ -80,6 +80,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
request._output_token_ids[fix_offset] = generated_token_ids request._output_token_ids[fix_offset] = generated_token_ids
request._all_token_ids[fix_offset] = generated_token_ids request._all_token_ids[fix_offset] = generated_token_ids
requsets_valid_token_len[req_id] += 1 requsets_valid_token_len[req_id] += 1
generated_token_ids = [generated_token_ids]
else: else:
valid_output_end = valid_output_len + len(generated_token_ids) - request.num_output_tokens valid_output_end = valid_output_len + len(generated_token_ids) - request.num_output_tokens
if valid_output_end == 0: if valid_output_end == 0:
...@@ -107,7 +108,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler, ...@@ -107,7 +108,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
pooler_output = None pooler_output = None
if pooler_outputs: if pooler_outputs:
pooler_output = pooler_outputs[req_index] pooler_output = pooler_outputs[req_idx]
stopped = check_stop(request, scheduler.max_model_len, stopped = check_stop(request, scheduler.max_model_len,
pooler_output, True) pooler_output, True)
if stopped: if stopped:
...@@ -118,7 +119,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler, ...@@ -118,7 +119,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
and request.sampling_params.logprobs is not None and logprobs: and request.sampling_params.logprobs is not None and logprobs:
# NOTE: once we support N tokens per step (spec decode), # NOTE: once we support N tokens per step (spec decode),
# the outer lists can be of length > 1. # the outer lists can be of length > 1.
new_logprobs = logprobs.slice(req_index, req_index + 1) new_logprobs = logprobs.slice(req_idx, req_idx + 1)
if new_token_ids and scheduler.structured_output_manager.should_advance( if new_token_ids and scheduler.structured_output_manager.should_advance(
request): request):
......
...@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner): ...@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner):
# If attention doesn't support CUDA Graphs for this batch, but we # If attention doesn't support CUDA Graphs for this batch, but we
# compiled with full CUDA graphs, we have to skip them entirely. # compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs): if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
model_output, finished_sending, finished_recving = \ model_output, finished_sending, finished_recving = \
tbo_split_and_execute_model(self, attn_metadata, num_input_tokens, tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
num_tokens_across_dp, input_ids, positions, num_tokens_across_dp, input_ids, positions,
inputs_embeds, scheduler_output, intermediate_tensors) inputs_embeds, scheduler_output, intermediate_tensors,
skip_cuda_graphs)
else: else:
# Run the model. # Run the model.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
......
...@@ -9,6 +9,6 @@ class ZeroV1ModelRunnerOutput(ModelRunnerOutput): ...@@ -9,6 +9,6 @@ class ZeroV1ModelRunnerOutput(ModelRunnerOutput):
# [num_reqs] # [num_reqs]
fix_req_ids: list[str] = None fix_req_ids: list[str] = None
fix_sampled_token_ids:list[list[int]] = None fix_sampled_token_ids:list[list[int]] = None
fix_draft_req_ids:list[list[int]] = None fix_draft_req_ids:list[str] = None
fix_draft_tokens_ids:list[list[int]] = None fix_draft_tokens_ids:list[list[int]] = None
is_output_valid:bool = True is_output_valid:bool = True
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment