Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8e340b4f
Commit
8e340b4f
authored
Jun 05, 2025
by
yangql
Browse files
Merge remote-tracking branch 'origin/v0.8.5.post1-dev' into v0.8.5.post1-dev
parents
1cb37dab
a68aef25
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
345 additions
and
322 deletions
+345
-322
tests/runai_model_streamer_test/test_weight_utils.py
tests/runai_model_streamer_test/test_weight_utils.py
+5
-3
tests/samplers/test_no_bad_words.py
tests/samplers/test_no_bad_words.py
+3
-2
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-3
tests/tokenization/test_detokenize.py
tests/tokenization/test_detokenize.py
+2
-2
tests/tokenization/test_tokenizer_group.py
tests/tokenization/test_tokenizer_group.py
+4
-2
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+286
-286
tests/v1/e2e/test_correctness_sliding_window.py
tests/v1/e2e/test_correctness_sliding_window.py
+6
-4
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+5
-0
tests/v1/e2e/untest_cascade_attention.py
tests/v1/e2e/untest_cascade_attention.py
+5
-4
tests/v1/engine/test_llm_engine.py
tests/v1/engine/test_llm_engine.py
+4
-2
tests/v1/engine/test_output_processor.py
tests/v1/engine/test_output_processor.py
+3
-2
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+9
-7
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+3
-1
tests/v1/sample/test_logprobs_e2e.py
tests/v1/sample/test_logprobs_e2e.py
+4
-3
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+2
-1
tests/v1/spec_decode/untest_max_len.py
tests/v1/spec_decode/untest_max_len.py
+0
-0
No files found.
tests/runai_model_streamer_test/test_weight_utils.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
glob
import
tempfile
...
...
@@ -9,6 +10,7 @@ import torch
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
,
runai_safetensors_weights_iterator
,
safetensors_weights_iterator
)
from
..utils
import
models_path_prefix
def
test_runai_model_loader
():
...
...
@@ -23,10 +25,10 @@ def test_runai_model_loader():
runai_model_streamer_tensors
=
{}
hf_safetensors_tensors
=
{}
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
runai_safetensors_weights_iterator
(
safetensors
,
False
):
runai_model_streamer_tensors
[
name
]
=
tensor
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
):
for
name
,
tensor
in
safetensors_weights_iterator
(
safetensors
,
False
):
hf_safetensors_tensors
[
name
]
=
tensor
assert
len
(
runai_model_streamer_tensors
)
==
len
(
hf_safetensors_tensors
)
...
...
@@ -38,4 +40,4 @@ def test_runai_model_loader():
if
__name__
==
"__main__"
:
test_runai_model_loader
()
test_runai_model_loader
()
\ No newline at end of file
tests/samplers/test_no_bad_words.py
View file @
8e340b4f
...
...
@@ -43,7 +43,8 @@ def _generate(
class
TestOneTokenBadWord
:
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-fp16"
)
# MODEL = os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-fp16")
MODEL
=
"TheBloke/Llama-2-7B-fp16"
PROMPT
=
"Hi! How are"
TARGET_TOKEN
=
"you"
...
...
@@ -191,4 +192,4 @@ class TestTwoTokenBadWord:
prompt
:
str
,
add_special_tokens
:
bool
=
True
)
->
list
[
int
]:
return
self
.
tokenizer
(
prompt
,
add_special_tokens
=
add_special_tokens
).
input_ids
add_special_tokens
=
add_special_tokens
).
input_ids
\ No newline at end of file
tests/tensorizer_loader/test_tensorizer.py
View file @
8e340b4f
...
...
@@ -7,16 +7,15 @@ import pathlib
import
subprocess
from
functools
import
partial
from
unittest.mock
import
MagicMock
,
patch
from
typing
import
List
,
Tuple
,
Optional
import
openai
import
pytest
import
torch
from
huggingface_hub
import
snapshot_download
from
typing
import
List
,
Tuple
,
Optional
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
...
...
@@ -26,6 +25,8 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
open_stream
,
serialize_vllm_model
,
tensorize_vllm_model
)
from
vllm.lora.request
import
LoRARequest
# yapf: enable
from
vllm.utils
import
PlaceholderModule
,
import_from_path
...
...
@@ -245,7 +246,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
)
model_ref
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-hf"
)
model_ref
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-hf"
)
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
...
...
tests/tokenization/test_detokenize.py
View file @
8e340b4f
...
...
@@ -89,7 +89,7 @@ def tokenizer(tokenizer_name):
AutoTokenizer
.
from_pretrained
(
tokenizer_name
))
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"mistralai/Pixtral-12B-2409"
])
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
)
])
@
pytest
.
mark
.
parametrize
(
"truth"
,
[
...
...
@@ -403,4 +403,4 @@ def test_decode_prompt_logprobs_chunked_prefill(
generated_string
+=
prompt_logprobs
[
prompt_token
].
decoded_token
assert
generated_string
==
example_prompts
[
idx
],
(
"Detokenized prompt logprobs do not match original prompt"
)
"Detokenized prompt logprobs do not match original prompt"
)
\ No newline at end of file
tests/tokenization/test_tokenizer_group.py
View file @
8e340b4f
...
...
@@ -8,11 +8,13 @@ from ..utils import models_path_prefix
from
vllm.transformers_utils.tokenizer_group
import
TokenizerGroup
# export HF_ENDPOINT=https://hf-mirror.com
@
pytest
.
mark
.
asyncio
async
def
test_tokenizer_group
():
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
# reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
reference_tokenizer
=
AutoTokenizer
.
from_pretrained
(
"gpt2"
)
tokenizer_group
=
TokenizerGroup
(
tokenizer_id
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
#
tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora
=
False
,
max_num_seqs
=
1
,
max_input_length
=
None
,
...
...
tests/v1/core/test_scheduler.py
View file @
8e340b4f
This diff is collapsed.
Click to expand it.
tests/v1/e2e/test_correctness_sliding_window.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...core.block.e2e.test_correctness_sliding_window
import
(
check_answers
,
prep_prompts
)
from
...utils
import
models_path_prefix
@
dataclass
...
...
@@ -16,16 +18,16 @@ class TestConfig:
model_config
=
{
"bigcode/starcoder2-3b"
:
TestConfig
(
4096
,
(
800
,
1100
)),
"google/gemma-2-2b-it"
:
TestConfig
(
4096
,
(
400
,
800
)),
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)
:
TestConfig
(
4096
,
(
800
,
1100
)),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
)
:
TestConfig
(
4096
,
(
400
,
800
)),
}
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"bigcode/starcoder2-3b"
,
# sliding window only
"google/gemma-2-2b-it"
,
# sliding window + full attention
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)
,
# sliding window only
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
)
,
# sliding window + full attention
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
...
...
tests/v1/e2e/test_spec_decode.py
View file @
8e340b4f
...
...
@@ -4,9 +4,11 @@ from __future__ import annotations
import
random
from
typing
import
Any
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
@
pytest
.
fixture
...
...
@@ -49,14 +51,17 @@ def sampling_config():
@
pytest
.
fixture
def
model_name
():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return
"meta-llama/Llama-3.1-8B-Instruct"
def
eagle_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
def
eagle3_model_name
():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
...
...
tests/v1/e2e/test_cascade_attention.py
→
tests/v1/e2e/
un
test_cascade_attention.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
fork_new_process_for_each_test
from
...utils
import
fork_new_process_for_each_test
,
models_path_prefix
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"attn_backend"
,
[
"FLASH_ATTN_VLLM_V1"
,
"FLASHINFER_VLLM_V1"
])
[
"FLASH_ATTN_VLLM_V1"
])
#
"FLASHINFER_VLLM_V1"
def
test_cascade_attention
(
example_system_message
,
monkeypatch
,
attn_backend
):
prompt
=
"
\n
<User>: Implement fibonacci sequence in Python.
\n
<Claude>:"
...
...
@@ -17,7 +18,7 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
# No cascade attention.
...
...
@@ -29,4 +30,4 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompts
=
[
example_system_message
+
prompt
]
*
64
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
for
response
in
responses
:
assert
response
.
outputs
[
0
].
text
==
ref_output
assert
response
.
outputs
[
0
].
text
==
ref_output
\ No newline at end of file
tests/v1/engine/test_llm_engine.py
View file @
8e340b4f
...
...
@@ -3,11 +3,13 @@
import
random
from
typing
import
Optional
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
MODEL
=
"facebook/opt-125m"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
DTYPE
=
"half"
...
...
@@ -96,4 +98,4 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
}
raise
AssertionError
(
f
"
{
len
(
completion_counts
)
}
unique completions; expected"
f
"
{
n
}
. Repeats:
{
repeats
}
"
)
f
"
{
n
}
. Repeats:
{
repeats
}
"
)
\ No newline at end of file
tests/v1/engine/test_output_processor.py
View file @
8e340b4f
...
...
@@ -20,6 +20,7 @@ from vllm.v1.engine import EngineCoreRequest
from
vllm.v1.engine.output_processor
import
(
OutputProcessor
,
RequestOutputCollector
)
from
vllm.v1.metrics.stats
import
IterationStats
from
...utils
import
models_path_prefix
def
_ref_convert_id_to_token
(
...
...
@@ -520,7 +521,7 @@ def test_stop_token(include_stop_str_in_output: bool,
dummy_test_vectors: dummy engine core outputs and other data structures
"""
model_id
=
dummy_test_vectors
.
tokenizer
.
name_or_path
if
model_id
!=
'meta-llama/Llama-3.2-1B'
:
if
model_id
!=
os
.
path
.
join
(
models_path_prefix
,
'meta-llama/Llama-3.2-1B'
)
:
raise
AssertionError
(
"Test requires meta-llama/Llama-3.2-1B but "
f
"
{
model_id
}
is in use."
)
do_logprobs
=
num_sample_logprobs
is
not
None
...
...
@@ -992,4 +993,4 @@ async def test_cumulative_output_collector_n():
# Third is the one where index is 2
third
=
[
k
for
k
in
result
.
outputs
if
k
.
index
==
2
]
assert
len
(
third
)
==
1
assert
third
[
0
].
text
==
"c"
assert
third
[
0
].
text
==
"c"
\ No newline at end of file
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
8e340b4f
...
...
@@ -7,6 +7,7 @@ import re
from
enum
import
Enum
from
typing
import
Any
import
os
import
jsonschema
import
pytest
from
pydantic
import
BaseModel
...
...
@@ -15,22 +16,23 @@ from vllm.entrypoints.llm import LLM
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
....utils
import
models_path_prefix
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"guidance:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"guidance:disable-any-whitespace"
,
"auto"
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar:disable-any-whitespace"
,
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"xgrammar:disable-any-whitespace"
,
"mistral"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
,
"xgrammar:disable-any-whitespace"
,
"auto"
),
#FIXME: This test is flaky on CI thus disabled
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
]
PARAMS_MODELS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"auto"
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Ministral-8B-Instruct-2410"
)
,
"auto"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
,
"auto"
),
]
...
...
@@ -572,4 +574,4 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
assert
"a3"
in
generated
assert
"a4"
not
in
generated
assert
"a5"
not
in
generated
assert
"a6"
not
in
generated
assert
"a6"
not
in
generated
\ No newline at end of file
tests/v1/sample/test_logprobs.py
View file @
8e340b4f
...
...
@@ -3,6 +3,7 @@
import
itertools
from
collections.abc
import
Generator
import
os
import
pytest
import
torch
...
...
@@ -13,8 +14,9 @@ from tests.v1.sample.utils import (
from
vllm
import
SamplingParams
from
...conftest
import
HfRunner
,
VllmRunner
from
...utils
import
models_path_prefix
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
DTYPE
=
"half"
NONE
=
BatchLogprobsComposition
.
NONE
...
...
tests/v1/sample/test_logprobs_e2e.py
View file @
8e340b4f
# SPDX-License-Identifier: Apache-2.0
import
os
import
lm_eval
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# arc-easy uses prompt_logprobs=1, logprobs=1
TASK
=
"arc_easy"
...
...
@@ -11,7 +12,7 @@ RTOL = 0.03
EXPECTED_VALUE
=
0.62
# FIXME(rob): enable prefix caching once supported.
MODEL
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
MODEL_ARGS
=
f
"pretrained=
{
MODEL
}
,enforce_eager=True,enable_prefix_caching=False"
# noqa: E501
SERVER_ARGS
=
[
"--enforce_eager"
,
"--no_enable_prefix_caching"
,
"--disable-log-requests"
...
...
@@ -49,4 +50,4 @@ def test_promt_logprobs_e2e_server():
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
\ No newline at end of file
tests/v1/sample/test_sampling_params_e2e.py
View file @
8e340b4f
...
...
@@ -4,11 +4,12 @@ import os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
if
os
.
getenv
(
"VLLM_USE_V1"
,
"0"
)
!=
"1"
:
pytest
.
skip
(
"Test package requires V1"
,
allow_module_level
=
True
)
MODEL
=
"meta-llama/Llama-3.2-1B"
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
PROMPT
=
"Hello my name is Robert and I"
...
...
tests/v1/spec_decode/test_max_len.py
→
tests/v1/spec_decode/
un
test_max_len.py
View file @
8e340b4f
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment