Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2c5e637b
Unverified
Commit
2c5e637b
authored
Feb 22, 2025
by
Kevin H. Luu
Committed by
GitHub
Feb 22, 2025
Browse files
[ci] Use env var to control whether to use S3 bucket in CI (#13634)
parent
322d2a27
Changes
30
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
67 additions
and
200 deletions
+67
-200
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-2
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+5
-6
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+2
-7
tests/conftest.py
tests/conftest.py
+1
-72
tests/engine/test_computed_prefix_blocks.py
tests/engine/test_computed_prefix_blocks.py
+1
-6
tests/engine/test_detokenization.py
tests/engine/test_detokenization.py
+2
-6
tests/engine/test_executor.py
tests/engine/test_executor.py
+4
-17
tests/engine/test_skip_tokenizer_init.py
tests/engine/test_skip_tokenizer_init.py
+5
-8
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+3
-10
tests/entrypoints/llm/test_collective_rpc.py
tests/entrypoints/llm/test_collective_rpc.py
+1
-1
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+1
-3
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+1
-3
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+1
-3
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+2
-5
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+2
-5
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+2
-7
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+27
-28
tests/models/test_initialization.py
tests/models/test_initialization.py
+1
-5
tests/mq_llm_engine/test_abort.py
tests/mq_llm_engine/test_abort.py
+2
-2
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+2
-4
No files found.
.buildkite/test-pipeline.yaml
View file @
2c5e637b
...
@@ -278,7 +278,7 @@ steps:
...
@@ -278,7 +278,7 @@ steps:
command
:
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
command
:
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
parallelism
:
4
parallelism
:
4
-
label
:
"
PyTorch
Fullgraph
Smoke
Test
"
# 9min
-
label
:
PyTorch Fullgraph Smoke Test
# 9min
fast_check
:
true
fast_check
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -289,7 +289,7 @@ steps:
...
@@ -289,7 +289,7 @@ steps:
-
pytest -v -s compile/piecewise/test_simple.py
-
pytest -v -s compile/piecewise/test_simple.py
-
pytest -v -s compile/piecewise/test_toy_llama.py
-
pytest -v -s compile/piecewise/test_toy_llama.py
-
label
:
"
PyTorch
Fullgraph
Test
"
# 18min
-
label
:
PyTorch Fullgraph Test
# 18min
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/compile
-
tests/compile
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
2c5e637b
...
@@ -9,7 +9,6 @@ import weakref
...
@@ -9,7 +9,6 @@ import weakref
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..conftest
import
VllmRunner
from
..conftest
import
VllmRunner
...
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
...
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
llm
=
LLM
(
"distilbert/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
llm
=
LLM
(
"distilbert/distilgpt2"
)
weak_llm
=
weakref
.
ref
(
llm
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
del
llm
# If there's any circular reference to vllm, this fails
# If there's any circular reference to vllm, this fails
...
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
...
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
model
:
str
,
model
:
str
,
...
@@ -97,8 +96,8 @@ def test_models(
...
@@ -97,8 +96,8 @@ def test_models(
"test_suite"
,
[
"test_suite"
,
[
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-
2-7b-hf
"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-
2-7b-hf
"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-
3.2-1B-Instruct
"
,
"mp"
,
""
,
"L4"
),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
...
...
tests/basic_correctness/test_cumem.py
View file @
2c5e637b
...
@@ -4,11 +4,9 @@ import pytest
...
@@ -4,11 +4,9 @@ import pytest
import
torch
import
torch
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.device_allocator.cumem
import
CuMemAllocator
from
vllm.device_allocator.cumem
import
CuMemAllocator
from
vllm.utils
import
GiB_bytes
from
vllm.utils
import
GiB_bytes
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..utils
import
fork_new_process_for_each_test
from
..utils
import
fork_new_process_for_each_test
...
@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
...
@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
"model, use_v1"
,
"model, use_v1"
,
[
[
# sleep mode with safetensors
# sleep mode with safetensors
(
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
meta-llama/Llama-3.2-1B"
,
True
),
(
"
meta-llama/Llama-3.2-1B"
,
True
),
# sleep mode with pytorch checkpoint
# sleep mode with pytorch checkpoint
(
"facebook/opt-125m"
,
False
),
(
"facebook/opt-125m"
,
False
),
])
])
...
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
...
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
if
use_v1
else
"0"
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
if
use_v1
else
"0"
free
,
total
=
torch
.
cuda
.
mem_get_info
()
free
,
total
=
torch
.
cuda
.
mem_get_info
()
used_bytes_baseline
=
total
-
free
# in case other process is running
used_bytes_baseline
=
total
-
free
# in case other process is running
load_format
=
LoadFormat
.
AUTO
llm
=
LLM
(
model
,
enable_sleep_mode
=
True
)
if
"Llama"
in
model
:
load_format
=
LoadFormat
.
RUNAI_STREAMER
llm
=
LLM
(
model
,
load_format
=
load_format
,
enable_sleep_mode
=
True
)
prompt
=
"How are you?"
prompt
=
"How are you?"
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
10
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
10
)
output
=
llm
.
generate
(
prompt
,
sampling_params
)
output
=
llm
.
generate
(
prompt
,
sampling_params
)
...
...
tests/conftest.py
View file @
2c5e637b
...
@@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
...
@@ -24,7 +24,7 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
LoadFormat
,
TaskOption
,
TokenizerPoolConfig
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
from
vllm.distributed
import
(
cleanup_dist_env_and_memory
,
init_distributed_environment
,
init_distributed_environment
,
...
@@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
...
@@ -47,70 +47,6 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M
=
TypeVar
(
"_M"
)
_M
=
TypeVar
(
"_M"
)
MODELS_ON_S3
=
[
"distilbert/distilgpt2"
,
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Meta-Llama-3-8B"
,
"meta-llama/Llama-3.2-1B"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
"openai-community/gpt2"
,
"ArthurZ/Ilama-3.2-1B"
,
"llava-hf/llava-1.5-7b-hf"
,
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
,
"ai21labs/Jamba-tiny-random"
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
"nm-testing/Phi-3-mini-128k-instruct-FP8"
,
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
,
"AMead10/Llama-3.2-1B-Instruct-AWQ"
,
"shuyuej/Llama-3.2-1B-Instruct-GPTQ"
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head"
,
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024"
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
,
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"
,
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
"nm-testing/tinyllama-oneshot-w4a16-group128-v2"
,
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
,
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
,
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
,
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing"
,
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing"
,
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing"
,
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"
,
"nm-testing/llama2.c-stories42M-pruned2.4-compressed"
,
]
MODEL_WEIGHTS_S3_BUCKET
=
"s3://vllm-ci-model-weights"
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
_PromptMultiModalInput
=
Union
[
List
[
_M
],
List
[
List
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
...
@@ -742,14 +678,8 @@ class VllmRunner:
...
@@ -742,14 +678,8 @@ class VllmRunner:
enable_chunked_prefill
:
bool
=
False
,
enable_chunked_prefill
:
bool
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
load_format
:
Optional
[
LoadFormat
]
=
None
,
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
if
model_name
in
MODELS_ON_S3
and
not
load_format
:
model_name
=
(
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model_name
}
"
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
if
not
load_format
:
load_format
=
LoadFormat
.
AUTO
self
.
model
=
LLM
(
self
.
model
=
LLM
(
model
=
model_name
,
model
=
model_name
,
task
=
task
,
task
=
task
,
...
@@ -764,7 +694,6 @@ class VllmRunner:
...
@@ -764,7 +694,6 @@ class VllmRunner:
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
block_size
=
block_size
,
block_size
=
block_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
load_format
=
load_format
,
**
kwargs
,
**
kwargs
,
)
)
...
...
tests/engine/test_computed_prefix_blocks.py
View file @
2c5e637b
...
@@ -2,16 +2,12 @@
...
@@ -2,16 +2,12 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
def
test_computed_prefix_blocks
(
model
:
str
,
block_size
:
int
):
# This test checks if we are able to run the engine to completion
# This test checks if we are able to run the engine to completion
...
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
...
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration."
)
"decoration."
)
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
block_size
=
block_size
,
block_size
=
block_size
,
enable_prefix_caching
=
True
)
enable_prefix_caching
=
True
)
...
...
tests/engine/test_detokenization.py
View file @
2c5e637b
...
@@ -2,15 +2,11 @@
...
@@ -2,15 +2,11 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_computed_prefix_blocks
(
model
:
str
):
def
test_computed_prefix_blocks
(
model
:
str
):
# This test checks if the engine generates completions both with and
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
# without optional detokenization, that detokenization includes text
...
@@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
...
@@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
"online for free?"
)
llm
=
LLM
(
model
=
model
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
llm
=
LLM
(
model
=
model
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
detokenize
=
False
)
detokenize
=
False
)
...
...
tests/engine/test_executor.py
View file @
2c5e637b
...
@@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
...
@@ -6,17 +6,12 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.executor.uniproc_executor
import
UniProcExecutor
from
vllm.executor.uniproc_executor
import
UniProcExecutor
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
class
Mock
:
class
Mock
:
...
...
...
@@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor):
...
@@ -38,12 +33,10 @@ class CustomUniExecutor(UniProcExecutor):
CustomUniExecutorAsync
=
CustomUniExecutor
CustomUniExecutorAsync
=
CustomUniExecutor
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_type_checking
(
model
):
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
Mock
)
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
...
@@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
...
@@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor
(
model
,
tmp_path
):
def
test_custom_executor
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
...
@@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutor
,
distributed_executor_backend
=
CustomUniExecutor
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
...
@@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
...
@@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_custom_executor_async
(
model
,
tmp_path
):
def
test_custom_executor_async
(
model
,
tmp_path
):
cwd
=
os
.
path
.
abspath
(
"."
)
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmp_path
)
os
.
chdir
(
tmp_path
)
...
@@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
model
,
model
=
model
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
distributed_executor_backend
=
CustomUniExecutorAsync
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
...
@@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path):
...
@@ -106,8 +95,7 @@ def test_custom_executor_async(model, tmp_path):
os
.
chdir
(
cwd
)
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_respect_ray
(
model
):
def
test_respect_ray
(
model
):
# even for TP=1 and PP=1,
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
# if users specify ray, we should use ray.
...
@@ -116,7 +104,6 @@ def test_respect_ray(model):
...
@@ -116,7 +104,6 @@ def test_respect_ray(model):
engine_args
=
EngineArgs
(
engine_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
distributed_executor_backend
=
"ray"
,
distributed_executor_backend
=
"ray"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
enforce_eager
=
True
,
# reduce test time
enforce_eager
=
True
,
# reduce test time
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/engine/test_skip_tokenizer_init.py
View file @
2c5e637b
...
@@ -2,22 +2,19 @@
...
@@ -2,22 +2,19 @@
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"distilbert/distilgpt2"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/distilbert/distilgpt2"
])
def
test_skip_tokenizer_initialization
(
model
:
str
):
def
test_skip_tokenizer_initialization
(
model
:
str
):
# This test checks if the flag skip_tokenizer_init skips the initialization
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
# token ids.
llm
=
LLM
(
model
=
model
,
llm
=
LLM
(
model
=
model
,
skip_tokenizer_init
=
True
,
skip_tokenizer_init
=
True
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
)
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
sampling_params
=
SamplingParams
(
prompt_logprobs
=
True
,
detokenize
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot pass text prompts when"
):
...
...
tests/entrypoints/llm/test_chat.py
View file @
2c5e637b
...
@@ -5,17 +5,12 @@ from typing import List
...
@@ -5,17 +5,12 @@ from typing import List
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
...conftest
import
MODEL_WEIGHTS_S3_BUCKET
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
..openai.test_vision
import
TEST_IMAGE_URLS
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
def
test_chat
():
def
test_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
messages
=
[
messages
=
[
...
@@ -33,8 +28,7 @@ def test_chat():
...
@@ -33,8 +28,7 @@ def test_chat():
def
test_multi_chat
():
def
test_multi_chat
():
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
)
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
prompt1
=
"Explain the concept of entropy."
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
prompt2
=
"Explain what among us is."
...
@@ -71,8 +65,7 @@ def test_multi_chat():
...
@@ -71,8 +65,7 @@ def test_multi_chat():
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
llm
=
LLM
(
model
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/Phi-3.5-vision-instruct"
,
model
=
"microsoft/Phi-3.5-vision-instruct"
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
max_num_seqs
=
5
,
...
...
tests/entrypoints/llm/test_collective_rpc.py
View file @
2c5e637b
...
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
...
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
def
echo_rank
(
self
):
def
echo_rank
(
self
):
return
self
.
rank
return
self
.
rank
llm
=
LLM
(
model
=
"
s3://vllm-ci-model-weights
/Llama-3.2-1B-Instruct"
,
llm
=
LLM
(
model
=
"
meta-llama
/Llama-3.2-1B-Instruct"
,
enforce_eager
=
True
,
enforce_eager
=
True
,
load_format
=
"dummy"
,
load_format
=
"dummy"
,
tensor_parallel_size
=
tp_size
,
tensor_parallel_size
=
tp_size
,
...
...
tests/entrypoints/llm/test_encode.py
View file @
2c5e637b
...
@@ -6,10 +6,9 @@ from typing import List
...
@@ -6,10 +6,9 @@ from typing import List
import
pytest
import
pytest
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
MODEL_NAME
=
"
s3://vllm-ci-model-weights
/e5-mistral-7b-instruct"
MODEL_NAME
=
"
intfloat
/e5-mistral-7b-instruct"
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -33,7 +32,6 @@ def llm():
...
@@ -33,7 +32,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
32768
,
max_num_batched_tokens
=
32768
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.75
,
gpu_memory_utilization
=
0.75
,
...
...
tests/entrypoints/llm/test_generate.py
View file @
2c5e637b
...
@@ -6,10 +6,9 @@ from typing import List
...
@@ -6,10 +6,9 @@ from typing import List
import
pytest
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
MODEL_NAME
=
"
s3://vllm-ci-model-weights
/distilgpt2"
MODEL_NAME
=
"
distilbert
/distilgpt2"
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -31,7 +30,6 @@ def llm():
...
@@ -31,7 +30,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
gpu_memory_utilization
=
0.10
,
...
...
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
2c5e637b
...
@@ -7,11 +7,10 @@ import pytest
...
@@ -7,11 +7,10 @@ import pytest
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
MODEL_NAME
=
"
s3://vllm-ci-model-weights
/zephyr-7b-beta"
MODEL_NAME
=
"
HuggingFaceH4
/zephyr-7b-beta"
PROMPTS
=
[
PROMPTS
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -28,7 +27,6 @@ def llm():
...
@@ -28,7 +27,6 @@ def llm():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enable_lora
=
True
,
enable_lora
=
True
,
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
2c5e637b
...
@@ -7,13 +7,12 @@ import weakref
...
@@ -7,13 +7,12 @@ import weakref
import
jsonschema
import
jsonschema
import
pytest
import
pytest
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
MODEL_NAME
=
"
s3://vllm-ci-model-weights
/Qwen2.5-1.5B-Instruct"
MODEL_NAME
=
"
Qwen
/Qwen2.5-1.5B-Instruct"
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
...
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
...
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def
llm
():
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
llm
=
LLM
(
model
=
MODEL_NAME
,
max_model_len
=
1024
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
max_model_len
=
1024
)
with
llm
.
deprecate_legacy_api
():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
...
...
tests/entrypoints/llm/test_lazy_outlines.py
View file @
2c5e637b
...
@@ -6,7 +6,6 @@ from contextlib import nullcontext
...
@@ -6,7 +6,6 @@ from contextlib import nullcontext
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
...
@@ -44,8 +43,7 @@ def run_normal():
...
@@ -44,8 +43,7 @@ def run_normal():
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/distilgpt2"
,
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
@@ -61,8 +59,7 @@ def run_normal():
...
@@ -61,8 +59,7 @@ def run_normal():
def
run_lmfe
(
sample_regex
):
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/distilgpt2"
,
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
guided_decoding_backend
=
"lm-format-enforcer"
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
...
...
tests/entrypoints/llm/test_prompt_validation.py
View file @
2c5e637b
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.config
import
LoadFormat
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
...
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
def
test_empty_prompt
():
def
test_empty_prompt
():
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/gpt2"
,
llm
=
LLM
(
model
=
"openai-community/gpt2"
,
enforce_eager
=
True
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
llm
.
generate
([
""
])
llm
.
generate
([
""
])
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skip_v1
def
test_out_of_vocab_token
():
def
test_out_of_vocab_token
():
llm
=
LLM
(
model
=
"s3://vllm-ci-model-weights/gpt2"
,
llm
=
LLM
(
model
=
"openai-community/gpt2"
,
enforce_eager
=
True
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
tests/metrics/test_metrics.py
View file @
2c5e637b
...
@@ -8,21 +8,17 @@ import ray
...
@@ -8,21 +8,17 @@ import ray
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm
import
EngineArgs
,
LLMEngine
from
vllm.config
import
LoadFormat
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.metrics
import
RayPrometheusStatLogger
from
vllm.engine.metrics
import
RayPrometheusStatLogger
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.test_utils
import
MODEL_WEIGHTS_S3_BUCKET
from
..conftest
import
MODEL_WEIGHTS_S3_BUCKET
MODELS
=
[
MODELS
=
[
"distilbert/distilgpt2"
,
"distilbert/distilgpt2"
,
]
]
RUNAI_STREAMER_LOAD_FORMAT
=
LoadFormat
.
RUNAI_STREAMER
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
...
@@ -146,9 +142,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
...
@@ -146,9 +142,8 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
if
served_model_name
is
None
or
served_model_name
==
[]:
if
served_model_name
is
None
or
served_model_name
==
[]:
actual_model_name
=
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model
}
"
assert
metrics_tag_content
==
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model
}
"
,
(
assert
metrics_tag_content
==
actual_model_name
,
(
f
"Metrics tag model_name is wrong! expect:
{
model
!
r
}
\n
"
f
"Metrics tag model_name is wrong! expect:
{
actual_model_name
!
r
}
\n
"
f
"actual:
{
metrics_tag_content
!
r
}
"
)
f
"actual:
{
metrics_tag_content
!
r
}
"
)
else
:
else
:
assert
metrics_tag_content
==
served_model_name
[
0
],
(
assert
metrics_tag_content
==
served_model_name
[
0
],
(
...
@@ -174,10 +169,11 @@ async def test_async_engine_log_metrics_regression(
...
@@ -174,10 +169,11 @@ async def test_async_engine_log_metrics_regression(
when disable_log_stats=False
when disable_log_stats=False
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
"""
"""
engine_args
=
AsyncEngineArgs
(
model
=
model
,
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
disable_log_stats
,
disable_log_stats
=
disable_log_stats
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
)
async_engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
async_engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
for
i
,
prompt
in
enumerate
(
example_prompts
):
for
i
,
prompt
in
enumerate
(
example_prompts
):
results
=
async_engine
.
generate
(
results
=
async_engine
.
generate
(
...
@@ -189,7 +185,7 @@ async def test_async_engine_log_metrics_regression(
...
@@ -189,7 +185,7 @@ async def test_async_engine_log_metrics_regression(
async
for
_
in
results
:
async
for
_
in
results
:
pass
pass
assert_metrics
(
async_engine
.
engine
,
disable_log_stats
,
assert_metrics
(
model
,
async_engine
.
engine
,
disable_log_stats
,
len
(
example_prompts
))
len
(
example_prompts
))
...
@@ -204,10 +200,11 @@ def test_engine_log_metrics_regression(
...
@@ -204,10 +200,11 @@ def test_engine_log_metrics_regression(
max_tokens
:
int
,
max_tokens
:
int
,
disable_log_stats
:
bool
,
disable_log_stats
:
bool
,
)
->
None
:
)
->
None
:
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
disable_log_stats
,
disable_log_stats
=
disable_log_stats
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
for
i
,
prompt
in
enumerate
(
example_prompts
):
for
i
,
prompt
in
enumerate
(
example_prompts
):
engine
.
add_request
(
engine
.
add_request
(
...
@@ -218,7 +215,8 @@ def test_engine_log_metrics_regression(
...
@@ -218,7 +215,8 @@ def test_engine_log_metrics_regression(
while
engine
.
has_unfinished_requests
():
while
engine
.
has_unfinished_requests
():
engine
.
step
()
engine
.
step
()
assert_metrics
(
engine
,
disable_log_stats
,
len
(
example_prompts
))
assert_metrics
(
f
"
{
MODEL_WEIGHTS_S3_BUCKET
}
/
{
model
}
"
,
engine
,
disable_log_stats
,
len
(
example_prompts
))
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -285,14 +283,15 @@ def test_metric_spec_decode_interval(
...
@@ -285,14 +283,15 @@ def test_metric_spec_decode_interval(
)
->
None
:
)
->
None
:
k
=
5
k
=
5
engine_args
=
EngineArgs
(
model
=
model
,
engine_args
=
EngineArgs
(
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
,
gpu_memory_utilization
=
0.4
,
speculative_model
=
model
,
speculative_model
=
model
,
num_speculative_tokens
=
k
,
num_speculative_tokens
=
k
,
enforce_eager
=
True
,
enforce_eager
=
True
,
load_format
=
RUNAI_STREAMER_LOAD_FORMAT
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
@@ -359,7 +358,7 @@ def test_metric_spec_decode_interval(
...
@@ -359,7 +358,7 @@ def test_metric_spec_decode_interval(
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
def
assert_metrics
(
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
def
assert_metrics
(
model
:
str
,
engine
:
LLMEngine
,
disable_log_stats
:
bool
,
num_requests
:
int
)
->
None
:
num_requests
:
int
)
->
None
:
if
disable_log_stats
:
if
disable_log_stats
:
with
pytest
.
raises
(
AttributeError
):
with
pytest
.
raises
(
AttributeError
):
...
@@ -370,7 +369,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
...
@@ -370,7 +369,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
# Ensure the count bucket of request-level histogram metrics matches
# Ensure the count bucket of request-level histogram metrics matches
# the number of requests as a simple sanity check to ensure metrics are
# the number of requests as a simple sanity check to ensure metrics are
# generated
# generated
labels
=
{
'model_name'
:
engine
.
model_config
.
model
}
labels
=
{
'model_name'
:
model
}
request_histogram_metrics
=
[
request_histogram_metrics
=
[
"vllm:e2e_request_latency_seconds"
,
"vllm:e2e_request_latency_seconds"
,
"vllm:request_prompt_tokens"
,
"vllm:request_prompt_tokens"
,
...
...
tests/models/test_initialization.py
View file @
2c5e637b
...
@@ -7,7 +7,6 @@ from transformers import PretrainedConfig
...
@@ -7,7 +7,6 @@ from transformers import PretrainedConfig
from
vllm
import
LLM
from
vllm
import
LLM
from
..conftest
import
MODELS_ON_S3
from
.registry
import
HF_EXAMPLE_MODELS
from
.registry
import
HF_EXAMPLE_MODELS
...
@@ -43,11 +42,8 @@ def test_can_initialize(model_arch):
...
@@ -43,11 +42,8 @@ def test_can_initialize(model_arch):
with
patch
.
object
(
LLM
.
get_engine_class
(),
"_initialize_kv_caches"
,
with
patch
.
object
(
LLM
.
get_engine_class
(),
"_initialize_kv_caches"
,
_initialize_kv_caches
):
_initialize_kv_caches
):
model_name
=
model_info
.
default
if
model_name
in
MODELS_ON_S3
:
model_name
=
f
"s3://vllm-ci-model-weights/
{
model_name
.
split
(
'/'
)[
-
1
]
}
"
LLM
(
LLM
(
model_
name
,
model_
info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
speculative_model
=
model_info
.
speculative_model
,
speculative_model
=
model_info
.
speculative_model
,
...
...
tests/mq_llm_engine/test_abort.py
View file @
2c5e637b
...
@@ -10,8 +10,8 @@ import pytest
...
@@ -10,8 +10,8 @@ import pytest
from
tests.mq_llm_engine.utils
import
RemoteMQLLMEngine
,
generate
from
tests.mq_llm_engine.utils
import
RemoteMQLLMEngine
,
generate
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
MODEL
=
"
s3://vllm-ci-model-weights
/gemma-1.1-2b-it"
MODEL
=
"
google
/gemma-1.1-2b-it"
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
)
RAISED_ERROR
=
KeyError
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
RAISED_VALUE
=
"foo"
EXPECTED_TOKENS
=
250
EXPECTED_TOKENS
=
250
...
...
tests/mq_llm_engine/test_error_handling.py
View file @
2c5e637b
...
@@ -21,10 +21,8 @@ from vllm.lora.request import LoRARequest
...
@@ -21,10 +21,8 @@ from vllm.lora.request import LoRARequest
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
MODEL
=
"s3://vllm-ci-model-weights/gemma-1.1-2b-it"
MODEL
=
"google/gemma-1.1-2b-it"
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
enforce_eager
=
True
)
load_format
=
"runai_streamer"
,
enforce_eager
=
True
)
RAISED_ERROR
=
KeyError
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
RAISED_VALUE
=
"foo"
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment