Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d9784107
Unverified
Commit
d9784107
authored
Jul 21, 2025
by
Ning Xie
Committed by
GitHub
Jul 21, 2025
Browse files
[Misc] unify variable for LLM instance (#20996)
Signed-off-by:
Andy Xie
<
andy.xning@gmail.com
>
parent
e6b90a28
Changes
53
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
76 additions
and
77 deletions
+76
-77
tests/lora/test_llama_tp.py
tests/lora/test_llama_tp.py
+10
-10
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+7
-7
tests/model_executor/test_model_load_with_params.py
tests/model_executor/test_model_load_with_params.py
+5
-5
tests/models/language/generation/test_hybrid.py
tests/models/language/generation/test_hybrid.py
+1
-1
tests/models/language/generation/test_mistral.py
tests/models/language/generation/test_mistral.py
+7
-7
tests/models/language/pooling/mteb_utils.py
tests/models/language/pooling/mteb_utils.py
+9
-9
tests/models/language/pooling/test_gritlm.py
tests/models/language/pooling/test_gritlm.py
+2
-2
tests/models/language/pooling/test_jina.py
tests/models/language/pooling/test_jina.py
+2
-2
tests/models/language/pooling/test_nomic_max_model_len.py
tests/models/language/pooling/test_nomic_max_model_len.py
+3
-3
tests/models/language/pooling/test_truncation_control.py
tests/models/language/pooling/test_truncation_control.py
+3
-3
tests/models/multimodal/generation/test_pixtral.py
tests/models/multimodal/generation/test_pixtral.py
+2
-3
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+1
-1
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+1
-1
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+1
-1
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
+1
-1
tests/models/quantization/test_modelopt.py
tests/models/quantization/test_modelopt.py
+3
-3
tests/models/quantization/test_nvfp4.py
tests/models/quantization/test_nvfp4.py
+3
-3
tests/prefix_caching/test_disable_sliding_window.py
tests/prefix_caching/test_disable_sliding_window.py
+11
-11
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+3
-3
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+1
-1
No files found.
tests/lora/test_llama_tp.py
View file @
d9784107
...
...
@@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri
=
tmp_path
/
"vllm"
/
model_ref
/
suffix
/
model_name
tensorizer_config
=
TensorizerConfig
(
tensorizer_uri
=
str
(
model_uri
))
loaded_
v
llm
_model
=
LLM
(
model
=
model_ref
,
load_format
=
"tensorizer"
,
enable_lora
=
True
,
enforce_eager
=
True
,
model_loader_extra_config
=
tensorizer_config
,
max_num_seqs
=
13
,
tensor_parallel_size
=
2
,
max_loras
=
2
)
loaded_llm
=
LLM
(
model
=
model_ref
,
load_format
=
"tensorizer"
,
enable_lora
=
True
,
enforce_eager
=
True
,
model_loader_extra_config
=
tensorizer_config
,
max_num_seqs
=
13
,
tensor_parallel_size
=
2
,
max_loras
=
2
)
tc_as_dict
=
tensorizer_config
.
to_serializable
()
print
(
"lora adapter created"
)
assert
do_sample
(
loaded_
v
llm
_model
,
assert
do_sample
(
loaded_llm
,
sql_lora_files
,
tensorizer_config_dict
=
tc_as_dict
,
lora_id
=
0
)
==
EXPECTED_NO_LORA_OUTPUT
print
(
"lora 1"
)
assert
do_sample
(
loaded_
v
llm
_model
,
assert
do_sample
(
loaded_llm
,
sql_lora_files
,
tensorizer_config_dict
=
tc_as_dict
,
lora_id
=
1
)
==
EXPECTED_LORA_OUTPUT
tests/metrics/test_metrics.py
View file @
d9784107
...
...
@@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
dtype
=
dtype
,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
prompt_token_counts
=
[
len
(
tokenizer
.
encode
(
p
))
for
p
in
example_prompts
]
...
...
@@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count
=
sum
(
prompt_token_counts
)
_
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
stat_logger
=
vllm_model
.
llm
.
llm_engine
.
stat_loggers
[
'prometheus'
]
metric_count
=
stat_logger
.
metrics
.
counter_prompt_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
...
...
@@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.4
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
stat_logger
=
vllm_model
.
llm
.
llm_engine
.
stat_loggers
[
'prometheus'
]
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
vllm_generation_count
=
0
...
...
@@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc
=
disable_async_output_proc
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
stat_logger
=
vllm_model
.
llm
.
llm_engine
.
stat_loggers
[
'prometheus'
]
metric_count
=
stat_logger
.
metrics
.
counter_generation_tokens
.
labels
(
**
stat_logger
.
labels
).
_value
.
get
()
vllm_generation_count
=
0
...
...
@@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats
=
False
,
gpu_memory_utilization
=
0.3
,
served_model_name
=
served_model_name
)
as
vllm_model
:
stat_logger
=
vllm_model
.
model
.
llm_engine
.
stat_loggers
[
'prometheus'
]
stat_logger
=
vllm_model
.
llm
.
llm_engine
.
stat_loggers
[
'prometheus'
]
metrics_tag_content
=
stat_logger
.
labels
[
"model_name"
]
if
envs
.
VLLM_CI_USE_S3
:
...
...
tests/model_executor/test_model_load_with_params.py
View file @
d9784107
...
...
@@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
output
=
vllm_model
.
embed
(
"Write a short story about a robot that"
" dreams for the first time.
\n
"
)
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_tokenizer
=
vllm_model
.
model
.
llm_engine
.
tokenizer
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
model_tokenizer
=
vllm_model
.
llm
.
llm_engine
.
tokenizer
# asserts on the bert model config file
assert
model_config
.
encoder_config
[
"max_seq_length"
]
==
512
...
...
@@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output
=
vllm_model
.
embed
(
"Write a short story about a robot that"
" dreams for the first time.
\n
"
)
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_tokenizer
=
vllm_model
.
model
.
llm_engine
.
tokenizer
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
model_tokenizer
=
vllm_model
.
llm
.
llm_engine
.
tokenizer
# asserts on the bert model config file
assert
model_config
.
encoder_config
[
"max_seq_length"
]
==
512
...
...
@@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output
=
vllm_model
.
embed
(
"Write a short story about a robot that"
" dreams for the first time.
\n
"
)
model_tokenizer
=
vllm_model
.
model
.
llm_engine
.
tokenizer
model_tokenizer
=
vllm_model
.
llm
.
llm_engine
.
tokenizer
assert
model_tokenizer
.
tokenizer_id
==
model_name
def
check_model
(
model
):
...
...
tests/models/language/generation/test_hybrid.py
View file @
d9784107
...
...
@@ -274,7 +274,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with
vllm_runner
(
model
,
max_num_seqs
=
MAX_NUM_SEQS
)
as
vllm_model
:
scheduler
=
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
]
scheduler
=
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
]
scheduler
.
ENABLE_ARTIFICIAL_PREEMPT
=
True
preempt_vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
...
...
tests/models/language/generation/test_mistral.py
View file @
d9784107
...
...
@@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format
=
"mistral"
)
as
vllm_model
:
for
prompt
in
SYMBOLIC_LANG_PROMPTS
:
msg
=
{
"role"
:
"user"
,
"content"
:
prompt
}
outputs
=
vllm_model
.
model
.
chat
([
msg
],
sampling_params
=
SAMPLING_PARAMS
)
outputs
=
vllm_model
.
llm
.
chat
([
msg
],
sampling_params
=
SAMPLING_PARAMS
)
assert
"�"
not
in
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
...
...
@@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format
=
"mistral"
)
as
vllm_model
:
msgs
=
copy
.
deepcopy
(
MSGS
)
outputs
=
vllm_model
.
model
.
chat
(
msgs
,
tools
=
TOOLS
,
sampling_params
=
SAMPLING_PARAMS
)
outputs
=
vllm_model
.
llm
.
chat
(
msgs
,
tools
=
TOOLS
,
sampling_params
=
SAMPLING_PARAMS
)
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
tool_parser
=
MistralToolParser
(
tokenizer
)
model_output
=
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
...
...
@@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
SAMPLE_JSON_SCHEMA
}
"
}]
outputs
=
vllm_model
.
model
.
chat
(
messages
,
sampling_params
=
params
)
outputs
=
vllm_model
.
llm
.
chat
(
messages
,
sampling_params
=
params
)
generated_text
=
outputs
[
0
].
outputs
[
0
].
text
json_response
=
json
.
loads
(
generated_text
)
...
...
tests/models/language/pooling/mteb_utils.py
View file @
d9784107
...
...
@@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):
def
__init__
(
self
,
vllm_model
):
super
().
__init__
()
self
.
model
=
vllm_model
self
.
llm
=
vllm_model
self
.
rng
=
np
.
random
.
default_rng
(
seed
=
42
)
def
encode
(
...
...
@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order.
r
=
self
.
rng
.
permutation
(
len
(
sentences
))
sentences
=
[
sentences
[
i
]
for
i
in
r
]
outputs
=
self
.
model
.
embed
(
sentences
,
use_tqdm
=
False
)
outputs
=
self
.
llm
.
embed
(
sentences
,
use_tqdm
=
False
)
embeds
=
np
.
array
(
outputs
)
embeds
=
embeds
[
np
.
argsort
(
r
)]
return
embeds
...
...
@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
queries
=
[
s
[
0
]
for
s
in
sentences
]
corpus
=
[
s
[
1
]
for
s
in
sentences
]
outputs
=
self
.
model
.
score
(
queries
,
corpus
,
truncate_prompt_tokens
=-
1
,
use_tqdm
=
False
)
outputs
=
self
.
llm
.
score
(
queries
,
corpus
,
truncate_prompt_tokens
=-
1
,
use_tqdm
=
False
)
scores
=
np
.
array
(
outputs
)
scores
=
scores
[
np
.
argsort
(
r
)]
return
scores
...
...
@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if
model_info
.
architecture
:
assert
(
model_info
.
architecture
in
vllm_model
.
model
.
llm_engine
.
model_config
.
architectures
)
in
vllm_model
.
llm
.
llm_engine
.
model_config
.
architectures
)
vllm_main_score
=
run_mteb_embed_task
(
VllmMtebEncoder
(
vllm_model
),
MTEB_EMBED_TASKS
)
vllm_dtype
=
vllm_model
.
model
.
llm_engine
.
model_config
.
dtype
vllm_dtype
=
vllm_model
.
llm
.
llm_engine
.
model_config
.
dtype
with
hf_runner
(
model_info
.
name
,
is_sentence_transformer
=
True
,
...
...
@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs
=
8
,
**
vllm_extra_kwargs
)
as
vllm_model
:
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
if
model_info
.
architecture
:
assert
(
model_info
.
architecture
in
model_config
.
architectures
)
...
...
tests/models/language/pooling/test_gritlm.py
View file @
d9784107
...
...
@@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task
=
"embed"
,
max_model_len
=
MAX_MODEL_LEN
,
)
as
vllm_model
:
llm
=
vllm_model
.
model
llm
=
vllm_model
.
llm
d_rep
=
run_llm_encode
(
llm
,
...
...
@@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task
=
"generate"
,
max_model_len
=
MAX_MODEL_LEN
,
)
as
vllm_model
:
llm
=
vllm_model
.
model
llm
=
vllm_model
.
llm
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
256
)
outputs
=
llm
.
generate
(
input
,
sampling_params
=
sampling_params
)
...
...
tests/models/language/pooling/test_jina.py
View file @
d9784107
...
...
@@ -87,10 +87,10 @@ def test_matryoshka(
task
=
"embed"
,
dtype
=
dtype
,
max_model_len
=
None
)
as
vllm_model
:
assert
vllm_model
.
model
.
llm_engine
.
model_config
.
is_matryoshka
assert
vllm_model
.
llm
.
llm_engine
.
model_config
.
is_matryoshka
matryoshka_dimensions
=
(
vllm_model
.
model
.
llm_engine
.
model_config
.
matryoshka_dimensions
)
vllm_model
.
llm
.
llm_engine
.
model_config
.
matryoshka_dimensions
)
assert
matryoshka_dimensions
is
not
None
if
dimensions
not
in
matryoshka_dimensions
:
...
...
tests/models/language/pooling/test_nomic_max_model_len.py
View file @
d9784107
...
...
@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def
test_default
(
model_info
,
vllm_runner
):
with
vllm_runner
(
model_info
.
name
,
task
=
"embed"
,
max_model_len
=
None
)
as
vllm_model
:
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
if
model_info
.
name
==
"nomic-ai/nomic-embed-text-v2-moe"
:
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
...
...
@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with
vllm_runner
(
model_info
.
name
,
task
=
"embed"
,
max_model_len
=
256
)
as
vllm_model
:
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
assert
model_config
.
max_model_len
==
256
# set 512 < max_model_len <= 2048
...
...
@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else
:
with
vllm_runner
(
model_info
.
name
,
task
=
"embed"
,
max_model_len
=
1024
)
as
vllm_model
:
model_config
=
vllm_model
.
model
.
llm_engine
.
model_config
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
assert
model_config
.
max_model_len
==
1024
...
...
tests/models/language/pooling/test_truncation_control.py
View file @
d9784107
...
...
@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with
vllm_runner
(
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
vllm_output
=
vllm_model
.
model
.
encode
(
vllm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
prompt_tokens
=
vllm_output
[
0
].
prompt_token_ids
...
...
@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with
vllm_runner
(
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
vllm_output
=
vllm_model
.
model
.
encode
(
vllm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
prompt_tokens
=
vllm_output
[
0
].
prompt_token_ids
...
...
@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
llm_output
=
vllm_model
.
model
.
encode
(
llm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
assert
llm_output
==
f
"""truncate_prompt_tokens value
...
...
tests/models/multimodal/generation/test_pixtral.py
View file @
d9784107
...
...
@@ -180,8 +180,7 @@ def test_chat(
)
as
vllm_model
:
outputs
=
[]
for
msg
in
MSGS
:
output
=
vllm_model
.
model
.
chat
(
msg
,
sampling_params
=
SAMPLING_PARAMS
)
output
=
vllm_model
.
llm
.
chat
(
msg
,
sampling_params
=
SAMPLING_PARAMS
)
outputs
.
extend
(
output
)
...
...
@@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len
=
8192
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
outputs
=
vllm_model
.
model
.
generate
(
prompt
)
outputs
=
vllm_model
.
llm
.
generate
(
prompt
)
assert
len
(
outputs
)
==
1
,
f
"
{
len
(
outputs
)
=
}
"
output
:
RequestOutput
=
outputs
[
0
]
...
...
tests/models/multimodal/generation/test_whisper.py
View file @
d9784107
...
...
@@ -106,7 +106,7 @@ def run_test(
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
llm
=
vllm_model
.
model
llm
=
vllm_model
.
llm
sampling_params
=
SamplingParams
(
temperature
=
0
,
...
...
tests/models/multimodal/generation/vlm_utils/core.py
View file @
d9784107
...
...
@@ -85,7 +85,7 @@ def run_test(
enforce_eager
=
enforce_eager
,
task
=
task
,
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
vllm_kwargs
:
dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
...
...
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
View file @
d9784107
...
...
@@ -96,7 +96,7 @@ def _run_test(
dtype
=
dtype
,
enforce_eager
=
True
,
max_model_len
=
8192
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
texts
=
[
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
...
...
tests/models/multimodal/pooling/test_jinavl_reranker.py
View file @
d9784107
...
...
@@ -56,7 +56,7 @@ def vllm_reranker(
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
as
vllm_model
:
outputs
=
vllm_model
.
model
.
score
(
query
,
documents
)
outputs
=
vllm_model
.
llm
.
score
(
query
,
documents
)
return
[
output
.
outputs
.
score
for
output
in
outputs
]
...
...
tests/models/quantization/test_modelopt.py
View file @
d9784107
...
...
@@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
def
test_models
(
example_prompts
,
model_name
)
->
None
:
model
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
...
...
@@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
outputs
=
model
.
generate
(
prompt
,
params
)
outputs
=
llm
.
generate
(
prompt
,
params
)
generations
.
append
(
outputs
[
0
].
outputs
[
0
].
text
)
del
model
del
llm
print
(
model_name
,
generations
)
expected_strs
=
EXPECTED_STRS_MAP
[
model_name
]
...
...
tests/models/quantization/test_nvfp4.py
View file @
d9784107
...
...
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason
=
"modelopt_fp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
def
test_models
(
example_prompts
,
model_name
)
->
None
:
model
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
...
...
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
outputs
=
model
.
generate
(
prompt
,
params
)
outputs
=
llm
.
generate
(
prompt
,
params
)
generations
.
append
(
outputs
[
0
].
outputs
[
0
].
text
)
del
model
del
llm
print
(
model_name
,
generations
)
expected_strs
=
EXPECTED_STRS_MAP
[
model_name
]
...
...
tests/prefix_caching/test_disable_sliding_window.py
View file @
d9784107
...
...
@@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
@
pytest
.
mark
.
parametrize
(
"model_len_len"
,
MODEL_LEN_LEN
)
def
test_disable_sliding_window
(
model_len_len
,
):
model
,
sliding_len
,
full_len
=
model_len_len
vllm_
disabled_
model
=
LLM
(
model
,
disable_sliding_window
=
True
)
vllm_
disabled_
model
.
generate
(
"Hi my name is"
)
model_config
=
vllm_
disabled_
model
.
llm_engine
.
model_config
disabled_
llm
=
LLM
(
model
,
disable_sliding_window
=
True
)
disabled_
llm
.
generate
(
"Hi my name is"
)
model_config
=
disabled_
llm
.
llm_engine
.
model_config
assert
model_config
.
max_model_len
==
sliding_len
,
(
"Max len expected to equal sliding_len of %s, but got %s"
,
sliding_len
,
model_config
.
max_model_len
)
del
vllm_
disabled_
model
del
disabled_
llm
cleanup_dist_env_and_memory
()
vllm_
enabled_
model
=
LLM
(
model
,
enforce_eager
=
True
,
disable_sliding_window
=
False
,
enable_prefix_caching
=
False
)
vllm_
enabled_
model
.
generate
(
"Hi my name is"
)
model_config
=
vllm_
enabled_
model
.
llm_engine
.
model_config
enabled_
llm
=
LLM
(
model
,
enforce_eager
=
True
,
disable_sliding_window
=
False
,
enable_prefix_caching
=
False
)
enabled_
llm
.
generate
(
"Hi my name is"
)
model_config
=
enabled_
llm
.
llm_engine
.
model_config
assert
model_config
.
max_model_len
==
full_len
,
(
"Max len expected to equal full_len of %s, but got %s"
,
full_len
,
model_config
.
max_model_len
)
del
vllm_
enabled_
model
del
enabled_
llm
cleanup_dist_env_and_memory
()
tests/prefix_caching/test_prefix_caching.py
View file @
d9784107
...
...
@@ -93,8 +93,8 @@ def test_mixed_requests(
# Run all the promopts
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
greedy_params
)
req_outputs
=
vllm_model
.
llm
.
generate
(
example_prompts
,
greedy_params
)
# Verify number of cached tokens
for
i
in
range
(
len
(
req_outputs
)):
...
...
@@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_seqs
=
max_num_batched_tokens
,
)
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
engine
:
LLMEngine
=
runner
.
llm
.
llm_engine
scheduler
:
Scheduler
=
SchedulerProxy
(
engine
.
scheduler
[
0
])
# type: ignore
engine
.
scheduler
[
0
]
=
scheduler
...
...
tests/quantization/test_gptq_dynamic.py
View file @
d9784107
...
...
@@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
linear_method_cls
=
GPTQMarlinLinearMethod
if
use_marlin_kernel
else
(
GPTQLinearMethod
)
for
name
,
submodule
in
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
for
name
,
submodule
in
(
vllm_model
.
llm
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
.
named_modules
()):
if
name
==
"lm_head"
:
assert
isinstance
(
submodule
.
quant_method
,
linear_method_cls
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment