Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
08cb5d8f
Commit
08cb5d8f
authored
Jun 13, 2025
by
王敏
Browse files
[fix]修复并行解码eagle和mlp相关单测问题
parent
9bcbaafc
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
50 additions
and
16 deletions
+50
-16
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+2
-0
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+34
-7
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/mlp_speculator.py
+9
-9
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/multi_step_worker.py
+5
-0
No files found.
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
08cb5d8f
...
@@ -27,6 +27,8 @@ from .conftest import run_equality_correctness_test
...
@@ -27,6 +27,8 @@ from .conftest import run_equality_correctness_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
import
vllm.envs
as
envs
import
vllm.envs
as
envs
os
.
environ
[
"LLAMA_NN"
]
=
"0"
# main model
# main model
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
08cb5d8f
...
@@ -59,6 +59,9 @@ PRECISION = "float16"
...
@@ -59,6 +59,9 @@ PRECISION = "float16"
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -72,9 +75,9 @@ PRECISION = "float16"
...
@@ -72,9 +75,9 @@ PRECISION = "float16"
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
128
,
128
,
])
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
,
32
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
,
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
def
test_mlp_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
def
test_mlp_e2e_greedy_correctness
(
vllm_runner
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
...
@@ -107,6 +110,9 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -107,6 +110,9 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -125,7 +131,7 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -125,7 +131,7 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
1
,
6
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
1
,
6
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
...
@@ -171,6 +177,9 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
...
@@ -171,6 +177,9 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -182,7 +191,7 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
...
@@ -182,7 +191,7 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
},
},
])
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
2048
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
2048
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
def
test_mlp_e2e_acceptance_rate
(
vllm_runner
,
common_llm_kwargs
,
def
test_mlp_e2e_acceptance_rate
(
vllm_runner
,
common_llm_kwargs
,
...
@@ -224,12 +233,15 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
...
@@ -224,12 +233,15 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
SPEC_MODEL
,
"model"
:
SPEC_MODEL
,
},
},
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"seed"
:
1
}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{
"seed"
:
1
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"seed"
:
5
}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"seed"
:
5
}])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"output_len"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
32
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"temperature"
,
[
1.0
])
@
pytest
.
mark
.
parametrize
(
"temperature"
,
[
1.0
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"prefill_chunk_size"
,
[
-
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
...
@@ -269,7 +281,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
...
@@ -269,7 +281,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
@@ -282,6 +294,9 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
...
@@ -282,6 +294,9 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -323,7 +338,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
...
@@ -323,7 +338,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
@@ -336,6 +351,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
...
@@ -336,6 +351,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -392,6 +410,9 @@ def test_mlp_e2e_greedy_correctness_with_padding(
...
@@ -392,6 +410,9 @@ def test_mlp_e2e_greedy_correctness_with_padding(
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -446,6 +467,9 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
...
@@ -446,6 +467,9 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs,
# Main model
# Main model
"model_name"
:
MAIN_MODEL
,
"model_name"
:
MAIN_MODEL
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -495,6 +519,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -495,6 +519,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# GPU memory utilization
"gpu_memory_utilization"
:
0.8
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
vllm/model_executor/models/mlp_speculator.py
View file @
08cb5d8f
...
@@ -213,14 +213,14 @@ class MLPSpeculator(nn.Module):
...
@@ -213,14 +213,14 @@ class MLPSpeculator(nn.Module):
weight_loader
(
param
,
loaded_weight
)
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
name
)
loaded_params
.
add
(
name
)
if
self
.
use_llama_nn
:
if
self
.
use_llama_nn
:
if
(
os
.
environ
[
'LM_NN'
]
==
'1'
and
"head"
in
name
)
or
"proj"
in
name
:
if
(
os
.
environ
[
'LM_NN'
]
==
'1'
and
"head"
in
name
)
or
"proj"
in
name
:
_weight
=
torch
.
zeros_like
(
param
.
data
)
_weight
=
torch
.
zeros_like
(
param
.
data
)
ori_shape
=
_weight
.
shape
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
param
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
ops
.
trans_w16_gemm
(
_weight
,
param
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
param
.
data
.
copy_
(
_weight
)
param
.
data
.
copy_
(
_weight
)
param
.
data
=
param
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
param
.
data
=
param
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
return
loaded_params
return
loaded_params
vllm/spec_decode/multi_step_worker.py
View file @
08cb5d8f
...
@@ -51,9 +51,14 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
...
@@ -51,9 +51,14 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
def
set_include_gpu_probs_tensor
(
self
)
->
None
:
def
set_include_gpu_probs_tensor
(
self
)
->
None
:
# Need include_gpu_probs_tensor for MultiStepWorker
# Need include_gpu_probs_tensor for MultiStepWorker
self
.
model_runner
.
sampler
.
include_gpu_probs_tensor
=
True
self
.
model_runner
.
sampler
.
include_gpu_probs_tensor
=
True
if
hasattr
(
self
.
model_runner
.
model
,
"sampler"
):
(
self
.
model_runner
.
model
.
sampler
.
include_gpu_probs_tensor
)
=
True
def
set_should_modify_greedy_probs_inplace
(
self
)
->
None
:
def
set_should_modify_greedy_probs_inplace
(
self
)
->
None
:
self
.
model_runner
.
sampler
.
should_modify_greedy_probs_inplace
=
True
self
.
model_runner
.
sampler
.
should_modify_greedy_probs_inplace
=
True
if
hasattr
(
self
.
model_runner
.
model
,
"sampler"
):
(
self
.
model_runner
.
model
.
sampler
.
should_modify_greedy_probs_inplace
)
=
True
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
sampler_output
(
def
sampler_output
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment