Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bd363067
Commit
bd363067
authored
Jun 05, 2025
by
lizhigong
Browse files
Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead
parents
87ef4618
d36deb1a
Changes
106
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
43 additions
and
27 deletions
+43
-27
tests/v1/shutdown/test_delete.py
tests/v1/shutdown/test_delete.py
+4
-2
tests/v1/shutdown/test_forward_error.py
tests/v1/shutdown/test_forward_error.py
+4
-2
tests/v1/shutdown/test_processor_error.py
tests/v1/shutdown/test_processor_error.py
+4
-2
tests/v1/shutdown/test_startup_error.py
tests/v1/shutdown/test_startup_error.py
+5
-3
tests/v1/spec_decode/__init__.py
tests/v1/spec_decode/__init__.py
+0
-0
tests/v1/spec_decode/test_max_len.py
tests/v1/spec_decode/test_max_len.py
+5
-3
tests/v1/spec_decode/test_ngram.py
tests/v1/spec_decode/test_ngram.py
+5
-3
tests/v1/test_async_llm_dp.py
tests/v1/test_async_llm_dp.py
+3
-2
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+8
-7
tests/v1/tpu/untest_basic.py
tests/v1/tpu/untest_basic.py
+0
-0
tests/v1/tpu/untest_mha_attn.py
tests/v1/tpu/untest_mha_attn.py
+0
-0
tests/v1/tpu/untest_multimodal.py
tests/v1/tpu/untest_multimodal.py
+0
-0
tests/v1/tpu/untest_pallas.py
tests/v1/tpu/untest_pallas.py
+0
-0
tests/v1/tpu/untest_perf.py
tests/v1/tpu/untest_perf.py
+0
-0
tests/v1/tpu/untest_sampler.py
tests/v1/tpu/untest_sampler.py
+0
-0
tests/v1/tpu/untest_topk_topp_sampler.py
tests/v1/tpu/untest_topk_topp_sampler.py
+0
-0
tests/v1/tpu/worker/untest_tpu_model_runner.py
tests/v1/tpu/worker/untest_tpu_model_runner.py
+0
-0
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+5
-3
tests/weight_loading/__init__.py
tests/weight_loading/__init__.py
+0
-0
tests/weight_loading/untest_weight_loading.py
tests/weight_loading/untest_weight_loading.py
+0
-0
No files found.
tests/v1/shutdown/test_delete.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
@
pytest
.
mark
.
asyncio
...
...
tests/v1/shutdown/test_forward_error.py
View file @
bd363067
...
...
@@ -3,6 +3,7 @@
import
asyncio
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.exceptions
import
EngineDeadError
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
def
evil_forward
(
self
,
*
args
,
**
kwargs
):
...
...
tests/v1/shutdown/test_processor_error.py
View file @
bd363067
...
...
@@ -3,6 +3,7 @@
import
asyncio
import
os
import
pytest
from
tests.v1.shutdown.utils
import
SHUTDOWN_TEST_TIMEOUT_SEC
...
...
@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.exceptions
import
EngineGenerateError
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
@
pytest
.
mark
.
asyncio
...
...
tests/v1/shutdown/test_startup_error.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
"""Test that we handle a startup Error and shutdown."""
import
os
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
...
...
@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
...utils
import
models_path_prefix
MODELS
=
[
"meta-llama/Llama-3.2-1B"
]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
]
def
evil_method
(
self
,
*
args
,
**
kwargs
):
...
...
@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing.
"""
if
model
!=
"meta-llama/Llama-3.2-1B"
:
if
model
!=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
:
pytest
.
skip
(
reason
=
"Only test meta-llama/Llama-3.2-1B"
)
if
cuda_device_count_stateless
()
<
tensor_parallel_size
:
pytest
.
skip
(
reason
=
"Not enough CUDA devices"
)
...
...
tests/v1/spec_decode/__init__.py
0 → 100644
View file @
bd363067
tests/v1/spec_decode/test_max_len.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
"""Test whether spec decoding handles the max model length properly."""
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
_PROMPTS
=
[
"1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
,
...
...
@@ -21,7 +23,7 @@ def test_ngram_max_len(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_model_len
=
100
,
enforce_eager
=
True
,
# For faster initialization.
speculative_config
=
{
...
...
@@ -44,11 +46,11 @@ def test_eagle_max_len(
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)
,
enforce_eager
=
True
,
# For faster initialization.
speculative_config
=
{
"method"
:
"eagle"
,
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
)
,
"num_speculative_tokens"
:
num_speculative_tokens
,
},
max_model_len
=
100
,
...
...
tests/v1/spec_decode/test_ngram.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
import
os
import
numpy
as
np
from
vllm.config
import
ModelConfig
,
SpeculativeConfig
,
VllmConfig
from
vllm.v1.spec_decode.ngram_proposer
import
(
NgramProposer
,
_find_subarray_kmp
,
_kmp_lps_array
)
from
...utils
import
models_path_prefix
def
test_kmp_lps_array
():
...
...
@@ -43,10 +45,10 @@ def test_ngram_proposer():
def
ngram_proposer
(
min_n
:
int
,
max_n
:
int
,
k
:
int
)
->
NgramProposer
:
# Dummy model config. Just to set max_model_len.
model_config
=
ModelConfig
(
model
=
"facebook/opt-125m"
,
model_config
=
ModelConfig
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
task
=
"generate"
,
max_model_len
=
100
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
tokenizer_mode
=
"auto"
,
dtype
=
"auto"
,
seed
=
None
,
...
...
tests/v1/test_async_llm_dp.py
View file @
bd363067
...
...
@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.core_client
import
DPAsyncMPClient
from
..utils
import
models_path_prefix
engine_args
=
AsyncEngineArgs
(
model
=
"ibm-research/PowerMoE-3b"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"ibm-research/PowerMoE-3b"
)
,
enforce_eager
=
True
,
disable_log_requests
=
True
,
tensor_parallel_size
=
int
(
os
.
getenv
(
"TP_SIZE"
,
1
)),
...
...
tests/v1/test_oracle.py
View file @
bd363067
...
...
@@ -7,16 +7,17 @@ import vllm.envs as envs
from
vllm
import
LLM
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
..utils
import
models_path_prefix
UNSUPPORTED_MODELS_V1
=
[
"openai/whisper-large-v3"
,
# transcription
"facebook/bart-large-cnn"
,
# encoder decoder
"mistralai/Mamba-Codestral-7B-v0.1"
,
# mamba
"ibm-ai-platform/Bamba-9B"
,
# hybrid
"BAAI/bge-m3"
,
# embedding
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)
,
# transcription
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-large-cnn"
)
,
# encoder decoder
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mamba-Codestral-7B-v0.1"
)
,
# mamba
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
)
,
# hybrid
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-m3"
)
,
# embedding
]
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
UNSUPPORTED_MODELS_V1
)
...
...
tests/v1/tpu/test_basic.py
→
tests/v1/tpu/
un
test_basic.py
View file @
bd363067
File moved
tests/v1/tpu/test_mha_attn.py
→
tests/v1/tpu/
un
test_mha_attn.py
View file @
bd363067
File moved
tests/v1/tpu/test_multimodal.py
→
tests/v1/tpu/
un
test_multimodal.py
View file @
bd363067
File moved
tests/v1/tpu/test_pallas.py
→
tests/v1/tpu/
un
test_pallas.py
View file @
bd363067
File moved
tests/v1/tpu/test_perf.py
→
tests/v1/tpu/
un
test_perf.py
View file @
bd363067
File moved
tests/v1/tpu/test_sampler.py
→
tests/v1/tpu/
un
test_sampler.py
View file @
bd363067
File moved
tests/v1/tpu/test_topk_topp_sampler.py
→
tests/v1/tpu/
un
test_topk_topp_sampler.py
View file @
bd363067
File moved
tests/v1/tpu/worker/test_tpu_model_runner.py
→
tests/v1/tpu/worker/
un
test_tpu_model_runner.py
View file @
bd363067
File moved
tests/v1/worker/test_gpu_model_runner.py
View file @
bd363067
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
...
...
@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
SchedulerOutput
)
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
from
...utils
import
models_path_prefix
@
pytest
.
fixture
...
...
@@ -17,9 +19,9 @@ def model_runner():
max_model_len
=
512
,
)
model_config
=
ModelConfig
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
task
=
"generate"
,
tokenizer
=
"facebook/opt-125m"
,
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
...
...
tests/weight_loading/__init__.py
0 → 100644
View file @
bd363067
tests/weight_loading/test_weight_loading.py
→
tests/weight_loading/
un
test_weight_loading.py
View file @
bd363067
File moved
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment