Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
48a9e546
Commit
48a9e546
authored
Sep 07, 2025
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev
parents
6372a1f3
c11b09df
Changes
98
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
416 additions
and
100 deletions
+416
-100
tests/v1/tpu/untest_tpu_qkv_linear.py
tests/v1/tpu/untest_tpu_qkv_linear.py
+0
-0
tests/worker/test_encoder_decoder_model_runner.py
tests/worker/test_encoder_decoder_model_runner.py
+2
-0
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+1
-1
vllm/_custom_ops.py
vllm/_custom_ops.py
+2
-1
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
...ted/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+3
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/model_executor/layers/fused_moe/config.py
vllm/model_executor/layers/fused_moe/config.py
+4
-0
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
...s/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
+164
-0
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
...fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
+164
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+7
-9
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+0
-1
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+4
-3
vllm/two_batch_overlap/v1/model_input_split_v1.py
vllm/two_batch_overlap/v1/model_input_split_v1.py
+33
-67
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+9
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+16
-11
vllm/zero_overhead/v1/core.py
vllm/zero_overhead/v1/core.py
+3
-2
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+2
-2
vllm/zero_overhead/v1/outputs.py
vllm/zero_overhead/v1/outputs.py
+1
-1
No files found.
tests/v1/tpu/test_tpu_qkv_linear.py
→
tests/v1/tpu/
un
test_tpu_qkv_linear.py
View file @
48a9e546
File moved
tests/worker/test_encoder_decoder_model_runner.py
View file @
48a9e546
...
...
@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert
torch
.
equal
(
actual
,
expected
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"ROCM is not supported."
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
)))
@
pytest
.
mark
.
parametrize
(
"multiple_seqs_per_seq_group"
,
[
True
,
False
])
def
test_prepare_decode_cuda_graph
(
batch_size
,
multiple_seqs_per_seq_group
):
...
...
tests/worker/test_model_runner.py
View file @
48a9e546
...
...
@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
trust_remote_code
=
True
,
enable_chunked_prefill
=
False
,
)
assert
model_runner
.
attn_backend
.
__name__
==
"TritonMLABackend"
assert
model_runner
.
attn_backend
.
__name__
==
"FlashMLABackend"
#
"TritonMLABackend"
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
,
3
)))
...
...
vllm/_custom_ops.py
View file @
48a9e546
...
...
@@ -1971,7 +1971,8 @@ def wvSplitKQ(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype,
# moe
def
moe_sum
(
input
:
torch
.
Tensor
,
output
:
torch
.
Tensor
):
torch
.
ops
.
_moe_C
.
moe_sum
(
input
,
output
)
def
moe_sum_opt1
(
input
:
torch
.
Tensor
,
output
:
torch
.
Tensor
):
torch
.
ops
.
_moe_C
.
moe_sum_opt1
(
input
,
output
)
def
moe_align_block_size
(
topk_ids
:
torch
.
Tensor
,
num_experts
:
int
,
block_size
:
int
,
sorted_token_ids
:
torch
.
Tensor
,
...
...
vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
View file @
48a9e546
...
...
@@ -215,6 +215,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
inject_kv_into_layer
(
kv_cache_layer
,
kv_cache
,
request
.
slot_mapping
,
request
.
request_id
)
tensor
=
self
.
p2p_nccl_engine
.
recv_store
.
pop
(
request
.
request_id
+
"#"
+
layer_name
,
None
)
if
tensor
is
not
None
:
del
tensor
def
wait_for_layer_load
(
self
,
layer_name
:
str
)
->
None
:
"""Blocking until the KV for a specific layer is loaded into vLLM's
...
...
vllm/engine/arg_utils.py
View file @
48a9e546
...
...
@@ -1004,7 +1004,7 @@ class EngineArgs:
enable_sleep_mode
=
self
.
enable_sleep_mode
,
model_impl
=
self
.
model_impl
,
override_attention_dtype
=
self
.
override_attention_dtype
,
enable_chunked_prefill
=
self
.
enable_chunked_prefill
enable_chunked_prefill
=
self
.
enable_chunked_prefill
,
)
def
create_load_config
(
self
)
->
LoadConfig
:
...
...
vllm/model_executor/layers/fused_moe/config.py
View file @
48a9e546
...
...
@@ -50,6 +50,7 @@ def get_config_quant_dtype(
use_int8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a8
:
bool
,
)
->
Optional
[
torch
.
dtype
]:
if
use_fp8_w8a8
:
return
torch
.
float8_e4m3fn
...
...
@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
per_out_ch_quant
:
bool
=
False
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
...
...
@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
,
use_int8_w8a16
,
use_int4_w4a16
,
use_int4_w4a8
,
]
])
<=
1
,
"Quantization flags are mutually exclusive."
...
...
@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
,
)
return
FusedMoEQuantConfig
(
quant_dtype
,
...
...
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
0 → 100644
View file @
48a9e546
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
0 → 100644
View file @
48a9e546
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
48a9e546
...
...
@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
B_scale
:
Optional
[
torch
.
Tensor
],
B_zp
:
Optional
[
torch
.
Tensor
],
topk_weights
:
Optional
[
torch
.
Tensor
],
topk_ids
:
torch
.
Tensor
,
sorted_token_ids
:
torch
.
Tensor
,
expert_ids
:
torch
.
Tensor
,
num_tokens_post_padded
:
torch
.
Tensor
,
...
...
@@ -1603,7 +1602,8 @@ def fused_experts_impl(
qtype
=
get_config_quant_dtype
(
use_fp8_w8a8
=
use_fp8_w8a8
,
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
)
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
)
get_config_func
=
functools
.
partial
(
try_get_optimal_moe_config
,
...
...
@@ -1689,7 +1689,6 @@ def fused_experts_impl(
w1_scale
,
w1_zp
,
curr_topk_weights
,
curr_topk_ids
,
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
,
...
...
@@ -1729,7 +1728,6 @@ def fused_experts_impl(
w2_scale
,
w2_zp
,
curr_topk_weights
,
curr_topk_ids
,
sorted_token_ids
,
expert_ids
,
num_tokens_post_padded
,
...
...
@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8
:
bool
=
False
,
use_int8_w8a16
:
bool
=
False
,
use_int4_w4a16
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
use_int4_w4a8
:
bool
=
False
,
per_act_token_quant
:
bool
=
False
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
):
...
...
@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
self
.
use_int4_w4a16
=
use_int4_w4a16
self
.
use_int8_w8a8
=
use_int8_w8a8
self
.
use_int8_w8a16
=
use_int8_w8a16
self
.
use_int4_w4a8
=
use_int4_w4a8
self
.
use_int4_w4a8
=
use_int4_w4a8
@
property
def
activation_formats
(
...
...
@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8
=
self
.
use_int8_w8a8
,
use_int8_w8a16
=
self
.
use_int8_w8a16
,
use_int4_w4a16
=
self
.
use_int4_w4a16
,
use_int4_w4a8
=
self
.
use_int4_w4a8
,
use_int4_w4a8
=
self
.
use_int4_w4a8
,
per_channel_quant
=
self
.
per_act_token_quant
,
block_shape
=
self
.
block_shape
)
...
...
@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
use_int8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
use_int4_w4a16
:
bool
,
use_int4_w4a8
:
bool
,
use_int4_w4a8
:
bool
,
per_act_token_quant
:
bool
,
block_shape
:
Optional
[
List
[
int
]]
=
None
,
)
->
mk
.
FusedMoEModularKernel
:
...
...
@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
use_int8_w8a8
=
use_int8_w8a8
,
use_int8_w8a16
=
use_int8_w8a16
,
use_int4_w4a16
=
use_int4_w4a16
,
use_int4_w4a8
=
use_int4_w4a8
,
use_int4_w4a8
=
use_int4_w4a8
,
per_act_token_quant
=
per_act_token_quant
,
block_shape
=
block_shape
,
),
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
48a9e546
...
...
@@ -795,7 +795,6 @@ class FusedMoE(torch.nn.Module):
if
quant_config
is
None
:
# Not considering quant for now, temporarily
self
.
use_nn_moe
=
int
(
os
.
environ
.
get
(
'MOE_NN'
,
1
))
==
1
# self.use_nn_moe = os.environ.get('MOE_NN') == '1'
else
:
self
.
use_nn_moe
=
False
...
...
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
48a9e546
...
...
@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
@
cache
def
is_rocm_aiter_moe_enabled
()
->
bool
:
return
current_platform
.
is_rocm
()
\
and
envs
.
VLLM_ROCM_USE_AITER_MOE
\
and
envs
.
VLLM_ROCM_USE_AITER
return
False
# return current_platform.is_rocm() \
# and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
def
rocm_aiter_asm_moe_tkw1_impl
(
...
...
vllm/two_batch_overlap/v1/model_input_split_v1.py
View file @
48a9e546
...
...
@@ -25,6 +25,7 @@ class TBOModelInputSplit():
self
.
req_num_right
=
0
self
.
scheduler_output_left
=
None
self
.
scheduler_output_right
=
None
self
.
query_start_loc_right
=
None
input_split
=
TBOModelInputSplit
()
...
...
@@ -136,78 +137,39 @@ def prepare_tbo_atten_metadata(
assert
num_reqs
>
0
seq_len_offset
=
req_offset
if
req_offset
==
0
:
#left
query_start_offset
=
0
else
:
query_start_offset
=
req_offset
+
1
# Get the number of scheduled tokens for each request.
tokens
=
[
scheduler_output
.
num_scheduled_tokens
[
i
]
for
i
in
req_ids
]
num_scheduled_tokens
=
np
.
array
(
tokens
,
dtype
=
np
.
int32
)
max_num_scheduled_tokens
=
max
(
tokens
)
# Get request indices.
# E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
req_indices
=
np
.
repeat
(
runner
.
arange_np
[:
num_reqs
],
num_scheduled_tokens
)
+
req_offset
if
req_offset
>
0
:
#right
if
input_split
.
query_start_loc_right
==
None
:
# TODO: create when system init
input_split
.
query_start_loc_right
=
torch
.
zeros
(
runner
.
max_num_reqs
+
1
,
dtype
=
torch
.
int32
,
device
=
runner
.
device
)
# cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
# arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
cu_num_tokens
,
arange
=
runner
.
_get_cumsum_and_arange
(
num_scheduled_tokens
)
cu_num_tokens
,
arange
=
runner
.
_get_cumsum_and_arange
(
num_scheduled_tokens
)
# Get positions.
positions_np
=
runner
.
positions_np
[:
total_num_scheduled_tokens
]
np
.
add
(
runner
.
input_batch
.
num_computed_tokens_cpu
[
req_indices
],
arange
,
out
=
positions_np
)
# Prepare the attention metadata.
runner
.
query_start_loc_np
[
0
]
=
0
runner
.
query_start_loc_np
[
1
:
num_reqs
+
1
]
=
cu_num_tokens
# Calculate the slot mapping for each KV cache group.
for
kv_cache_group_id
,
kv_cache_group_spec
in
enumerate
(
runner
.
kv_cache_config
.
kv_cache_groups
):
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_table
:
BlockTable
=
runner
.
input_batch
.
block_table
[
kv_cache_group_id
]
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
block_table_indices
=
(
req_indices
*
block_table
.
max_num_blocks_per_req
+
positions_np
//
block_size
)
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
(
)[
block_table_indices
].
numpy
()
block_offsets
=
positions_np
%
block_size
np
.
add
(
block_numbers
*
block_size
,
block_offsets
,
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
# Prepare the attention metadata.
runner
.
query_start_loc_np
[
0
]
=
0
runner
.
query_start_loc_np
[
1
:
num_reqs
+
1
]
=
cu_num_tokens
runner
.
seq_lens_np
[:
num_reqs
]
=
(
runner
.
input_batch
.
num_computed_tokens_cpu
[
req_offset
:
req_offset
+
num_reqs
]
+
num_scheduled_tokens
)
runner
.
query_start_loc
[
query_start_offset
:
query_start_offset
+
num_reqs
+
1
].
copy_
(
runner
.
query_start_loc_cpu
[:
num_reqs
+
1
],
non_blocking
=
True
)
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
if
req_offset
>
0
:
#right
runner
.
query_start_loc
[
query_start_offset
+
num_reqs
+
1
:].
fill_
(
input_split
.
query_start_loc_right
[
0
:
num_reqs
+
1
].
copy_
(
runner
.
query_start_loc_cpu
[:
num_reqs
+
1
],
non_blocking
=
True
)
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
input_split
.
query_start_loc_right
[
num_reqs
+
1
:].
fill_
(
runner
.
query_start_loc_cpu
[
num_reqs
].
item
())
runner
.
seq_lens
[
seq_len_offset
:
seq_len_offset
+
num_reqs
].
copy_
(
runner
.
seq_lens_cpu
[:
num_reqs
],
non_blocking
=
True
)
query_start_loc
=
input_split
.
query_start_loc_right
[:
num_reqs
+
1
]
# Fill unused with -1. Needed for reshape_and_cache
if
req_offset
>
0
:
#right
runner
.
seq_lens
[
seq_len_offset
+
num_reqs
:].
fill_
(
0
)
query_start_loc
=
runner
.
query_start_loc
[
query_start_offset
:
query_start_offset
+
num_reqs
+
1
]
else
:
query_start_loc
=
runner
.
query_start_loc
[:
num_reqs
+
1
]
seq_lens
=
runner
.
seq_lens
[
seq_len_offset
:
seq_len_offset
+
num_reqs
]
common_attn_metadata
=
CommonAttentionMetadata
(
...
...
@@ -240,6 +202,9 @@ def prepare_tbo_atten_metadata(
origin_slot_mapping
=
metadata_builder
.
block_table
.
slot_mapping
metadata_builder
.
block_table
.
slot_mapping
=
\
origin_slot_mapping
[
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
:]
origin_slot_map_cpu
=
metadata_builder
.
block_table
.
slot_mapping_cpu
metadata_builder
.
block_table
.
slot_mapping_cpu
=
\
origin_slot_map_cpu
[
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
:]
if
isinstance
(
metadata_builder
,
MLACommonMetadataBuilder
):
# now support prefill only
_num_decodes_record
=
metadata_builder
.
_num_decodes
_num_prefills_record
=
metadata_builder
.
_num_prefills
...
...
@@ -257,6 +222,7 @@ def prepare_tbo_atten_metadata(
if
req_offset
>
0
:
metadata_builder
.
block_table
.
block_table
=
origin_block_table
metadata_builder
.
block_table
.
slot_mapping
=
origin_slot_mapping
metadata_builder
.
block_table
.
slot_mapping_cpu
=
origin_slot_map_cpu
if
isinstance
(
metadata_builder
,
MLACommonMetadataBuilder
):
# now support prefill only
metadata_builder
.
_num_decodes
=
_num_decodes_record
...
...
@@ -304,18 +270,16 @@ def tbo_split_and_execute_model(
inputs_embeds
,
scheduler_output
:
"SchedulerOutput"
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
skip_cuda_graphs
:
bool
=
True
,
)
->
Union
[
ModelRunnerOutput
,
IntermediateTensors
]:
use_tbo
=
False
if
isinstance
(
runner
.
attn_metadata_builders
[
0
],
MLACommonMetadataBuilder
)
and
\
runner
.
attn_metadata_builders
[
0
].
_num_decodes
>
0
:
#is mla decode
use_tbo
=
False
else
:
if
len
(
scheduler_output
.
num_scheduled_tokens
)
>
1
:
if
len
(
scheduler_output
.
num_scheduled_tokens
)
>
1
and
num_input_tokens
>
envs
.
VLLM_TBO_MIN_TOKENS
:
split_scheduler_output
(
runner
,
scheduler_output
)
if
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
and
\
input_split
.
scheduler_output_right
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
:
use_tbo
=
True
use_tbo
=
True
if
use_tbo
:
num_input_tokens_left
=
input_split
.
scheduler_output_left
.
total_num_scheduled_tokens
num_input_tokens_right
=
num_input_tokens
-
num_input_tokens_left
...
...
@@ -338,11 +302,12 @@ def tbo_split_and_execute_model(
else
:
# Run the decoder.
# Use persistent buffers for CUDA graphs.
envs
.
VLLM_ENABLE_TBO
=
False
with
set_forward_context
(
attn_metadata
,
runner
.
vllm_config
,
num_tokens
=
num_input_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
,
skip_cuda_graphs
=
True
):
skip_cuda_graphs
=
skip_cuda_graphs
):
runner
.
maybe_setup_kv_connector
(
scheduler_output
)
model_output
=
runner
.
model
(
...
...
@@ -355,4 +320,5 @@ def tbo_split_and_execute_model(
runner
.
maybe_wait_for_kv_save
()
finished_sending
,
finished_recving
=
(
runner
.
get_finished_kv_transfers
(
scheduler_output
))
envs
.
VLLM_ENABLE_TBO
=
True
return
model_output
,
finished_sending
,
finished_recving
\ No newline at end of file
vllm/v1/worker/gpu_input_batch.py
View file @
48a9e546
...
...
@@ -38,6 +38,7 @@ class CachedRequestState:
block_ids
:
tuple
[
list
[
int
],
...]
num_computed_tokens
:
int
output_token_ids
:
list
[
int
]
spec_token_ids
:
list
[
int
]
=
None
mrope_positions
:
Optional
[
torch
.
Tensor
]
=
None
mrope_position_delta
:
Optional
[
int
]
=
None
...
...
@@ -288,9 +289,16 @@ class InputBatch:
end_idx
=
start_idx
+
len
(
request
.
output_token_ids
)
self
.
token_ids_cpu
[
req_index
,
start_idx
:
end_idx
]
=
request
.
output_token_ids
num_spec_tokens
=
0
if
request
.
spec_token_ids
!=
None
:
num_spec_tokens
=
len
(
request
.
spec_token_ids
)
self
.
token_ids_cpu
[
req_index
,
end_idx
:
end_idx
+
num_spec_tokens
]
=
request
.
spec_token_ids
# Number of token ids in token_ids_cpu.
# NOTE(woosuk): This may include spec decode tokens.
self
.
num_tokens
[
req_index
]
=
request
.
num_tokens
self
.
num_tokens
[
req_index
]
=
request
.
num_tokens
+
num_spec_tokens
# Number of tokens without spec decode tokens.
self
.
num_tokens_no_spec
[
req_index
]
=
request
.
num_tokens
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
48a9e546
...
...
@@ -482,6 +482,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Update the cached states.
req_state
.
num_computed_tokens
=
num_computed_tokens
spec_token_ids
=
(
scheduler_output
.
scheduled_spec_decode_tokens
.
get
(
req_id
,
()))
if
not
is_last_rank
:
# When using PP, the scheduler sends the sampled tokens back,
...
...
@@ -498,6 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
elif
num_new_tokens
>
0
:
req_state
.
output_token_ids
.
extend
(
new_token_ids
[
-
num_new_tokens
:])
if
len
(
spec_token_ids
)
>
0
:
req_state
.
spec_token_ids
=
spec_token_ids
# Update the block IDs.
if
not
resumed_from_preemption
:
...
...
@@ -537,8 +541,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
input_batch
.
num_tokens
[
req_index
]
=
end_token_index
# Add spec_token_ids to token_ids_cpu.
spec_token_ids
=
(
scheduler_output
.
scheduled_spec_decode_tokens
.
get
(
req_id
,
()))
if
spec_token_ids
:
num_spec_tokens
=
len
(
spec_token_ids
)
start_index
=
self
.
input_batch
.
num_tokens_no_spec
[
req_index
]
...
...
@@ -635,7 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# where M is the max_model_len.
token_indices
=
(
positions_np
+
req_indices
*
self
.
input_batch
.
token_ids_cpu
.
shape
[
1
])
# NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large
# tensors.
...
...
@@ -1381,7 +1383,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
)
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
skip_cuda_graphs
)
else
:
# Run the model.
# Use persistent buffers for CUDA graphs.
...
...
@@ -2096,8 +2099,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
hidden_states
=
outputs
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
()
and
not
is_profile
:
assert
isinstance
(
self
.
drafter
,
EagleProposer
)
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
)
#assert isinstance(self.drafter, EagleProposer)
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
)
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real
...
...
@@ -2231,8 +2235,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def
profile_run
(
self
)
->
None
:
# set profiling flag to avoid torch compile
set_profilling
(
True
)
self
.
_sync_device
()
#
set_profilling(True)
#
self._sync_device()
# Profile with multimodal encoder & encoder cache.
# TODO: handle encoder-decoder models once we support them.
...
...
@@ -2317,7 +2321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
del
hidden_states
,
output
self
.
encoder_cache
.
clear
()
gc
.
collect
()
set_profilling
(
False
)
#
set_profilling(False)
def
capture_model
(
self
)
->
None
:
if
not
self
.
use_cuda_graph
:
...
...
@@ -2685,10 +2689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
kv_caches
=
self
.
initialize_kv_cache_tensors
(
kv_cache_config
)
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
():
assert
isinstance
(
self
.
drafter
,
EagleProposer
)
#
assert isinstance(self.drafter, EagleProposer)
# validate all draft model layers belong to the same kv cache
# group
self
.
drafter
.
validate_same_kv_cache_group
(
kv_cache_config
)
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
validate_same_kv_cache_group
(
kv_cache_config
)
if
has_kv_transfer_group
():
get_kv_transfer_group
().
register_kv_caches
(
kv_caches
)
...
...
vllm/zero_overhead/v1/core.py
View file @
48a9e546
...
...
@@ -80,6 +80,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
request
.
_output_token_ids
[
fix_offset
]
=
generated_token_ids
request
.
_all_token_ids
[
fix_offset
]
=
generated_token_ids
requsets_valid_token_len
[
req_id
]
+=
1
generated_token_ids
=
[
generated_token_ids
]
else
:
valid_output_end
=
valid_output_len
+
len
(
generated_token_ids
)
-
request
.
num_output_tokens
if
valid_output_end
==
0
:
...
...
@@ -107,7 +108,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
pooler_output
=
None
if
pooler_outputs
:
pooler_output
=
pooler_outputs
[
req_i
nde
x
]
pooler_output
=
pooler_outputs
[
req_i
d
x
]
stopped
=
check_stop
(
request
,
scheduler
.
max_model_len
,
pooler_output
,
True
)
if
stopped
:
...
...
@@ -118,7 +119,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
and
request
.
sampling_params
.
logprobs
is
not
None
and
logprobs
:
# NOTE: once we support N tokens per step (spec decode),
# the outer lists can be of length > 1.
new_logprobs
=
logprobs
.
slice
(
req_i
nde
x
,
req_i
nde
x
+
1
)
new_logprobs
=
logprobs
.
slice
(
req_i
d
x
,
req_i
d
x
+
1
)
if
new_token_ids
and
scheduler
.
structured_output_manager
.
should_advance
(
request
):
...
...
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
48a9e546
...
...
@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner):
# If attention doesn't support CUDA Graphs for this batch, but we
# compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs
=
self
.
full_cuda_graph
and
not
attention_cuda_graphs
if
envs
.
VLLM_ENABLE_TBO
and
(
not
self
.
use_cuda_graph
or
skip_cuda_graphs
):
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
inputs_embeds
,
scheduler_output
,
intermediate_tensors
)
inputs_embeds
,
scheduler_output
,
intermediate_tensors
,
skip_cuda_graphs
)
else
:
# Run the model.
# Use persistent buffers for CUDA graphs.
...
...
vllm/zero_overhead/v1/outputs.py
View file @
48a9e546
...
...
@@ -9,6 +9,6 @@ class ZeroV1ModelRunnerOutput(ModelRunnerOutput):
# [num_reqs]
fix_req_ids
:
list
[
str
]
=
None
fix_sampled_token_ids
:
list
[
list
[
int
]]
=
None
fix_draft_req_ids
:
list
[
list
[
int
]
]
=
None
fix_draft_req_ids
:
list
[
str
]
=
None
fix_draft_tokens_ids
:
list
[
list
[
int
]]
=
None
is_output_valid
:
bool
=
True
\ No newline at end of file
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment