Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5e078c69
Commit
5e078c69
authored
Jun 03, 2025
by
zhuwenwen
Browse files
[tests]skip tpu and weight_loading tests, fix tests of worker
parent
ced28510
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
141 additions
and
133 deletions
+141
-133
tests/spec_decode/e2e/test_compatibility.py
tests/spec_decode/e2e/test_compatibility.py
+4
-4
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+6
-6
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+4
-2
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+2
-2
tests/spec_decode/test_multi_step_worker.py
tests/spec_decode/test_multi_step_worker.py
+101
-101
tests/spec_decode/test_spec_decode_worker.py
tests/spec_decode/test_spec_decode_worker.py
+5
-3
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-2
tests/tool_use/utils.py
tests/tool_use/utils.py
+9
-9
tests/tpu/untest_quantization_accuracy.py
tests/tpu/untest_quantization_accuracy.py
+4
-2
tests/v1/tpu/untest_basic.py
tests/v1/tpu/untest_basic.py
+0
-0
tests/v1/tpu/untest_mha_attn.py
tests/v1/tpu/untest_mha_attn.py
+0
-0
tests/v1/tpu/untest_multimodal.py
tests/v1/tpu/untest_multimodal.py
+0
-0
tests/v1/tpu/untest_pallas.py
tests/v1/tpu/untest_pallas.py
+0
-0
tests/v1/tpu/untest_perf.py
tests/v1/tpu/untest_perf.py
+0
-0
tests/v1/tpu/untest_sampler.py
tests/v1/tpu/untest_sampler.py
+0
-0
tests/v1/tpu/untest_topk_topp_sampler.py
tests/v1/tpu/untest_topk_topp_sampler.py
+0
-0
tests/v1/tpu/worker/untest_tpu_model_runner.py
tests/v1/tpu/worker/untest_tpu_model_runner.py
+0
-0
tests/weight_loading/__init__.py
tests/weight_loading/__init__.py
+0
-0
tests/weight_loading/untest_weight_loading.py
tests/weight_loading/untest_weight_loading.py
+0
-0
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+2
-2
No files found.
tests/spec_decode/e2e/test_compatibility.py
View file @
5e078c69
...
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
...
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{
{
# Speculative max model len > overridden max model len should raise.
# Speculative max model len > overridden max model len should raise.
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
129
,
"max_model_len"
:
129
,
},
},
...
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
...
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise.
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
2048
+
1
,
"max_model_len"
:
2048
+
1
,
},
},
...
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
...
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise.
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"JackFram/llama-68m"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
"num_speculative_tokens"
:
5
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
131072
+
1
,
"max_model_len"
:
131072
+
1
,
},
},
...
@@ -64,4 +64,4 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
...
@@ -64,4 +64,4 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot be larger than"
):
with
pytest
.
raises
(
ValueError
,
match
=
"cannot be larger than"
):
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
get_output_from_llm_generator
(
test_llm_generator
,
prompts
,
sampling_params
)
sampling_params
)
\ No newline at end of file
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
5e078c69
...
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"meta-llama/Llama-2-7b-chat-hf"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-chat-hf"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-llama2-chat-7B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-llama2-chat-7B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype"
:
"float16"
,
"dtype"
:
"float16"
,
# Main model
# Main model
"model_name"
:
"Qwen/Qwen2-7B-Instruct"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-7B-Instruct"
)
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
{
{
"speculative_config"
:
{
"speculative_config"
:
{
"model"
:
"yuhuili/EAGLE-Qwen2-7B-Instruct"
,
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-Qwen2-7B-Instruct"
)
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
"num_speculative_tokens"
:
MAX_SPEC_TOKENS
,
},
},
},
},
...
...
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
5e078c69
...
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
...
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
correctess for the target model outputs.
"""
"""
import
os
import
pytest
import
pytest
from
.conftest
import
run_equality_correctness_test
from
.conftest
import
run_equality_correctness_test
from
...utils
import
models_path_prefix
# main model
# main model
MAIN_MODEL
=
"luccafong/deepseek_mtp_main_random"
MAIN_MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
)
# max. number of speculative tokens: this corresponds to
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
# num_nextn_predict_layers in the config.json of the speculator model.
...
@@ -329,4 +331,4 @@ def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -329,4 +331,4 @@ def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
import
pytest
import
pytest
pytest
.
main
([
__file__
])
pytest
.
main
([
__file__
])
\ No newline at end of file
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
5e078c69
...
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-68m"
,
"model_name"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
...
@@ -372,4 +372,4 @@ def test_ngram_scorer(vllm_runner, common_llm_kwargs,
...
@@ -372,4 +372,4 @@ def test_ngram_scorer(vllm_runner, common_llm_kwargs,
batch_size
,
batch_size
,
max_output_len
=
output_len
,
max_output_len
=
output_len
,
seed
=
seed
,
seed
=
seed
,
temperature
=
0.0
)
temperature
=
0.0
)
\ No newline at end of file
tests/spec_decode/test_multi_step_worker.py
View file @
5e078c69
...
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
...
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert
(
num_mismatch
>
0
)
assert
(
num_mismatch
>
0
)
@
torch
.
inference_mode
()
#
@torch.inference_mode()
@
pytest
.
mark
.
parametrize
(
'num_steps'
,
[
1
,
2
,
3
,
4
])
#
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between
#
# The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test
#
# the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths.
#
# both code paths.
@
pytest
.
mark
.
parametrize
(
'attn_backend'
,
#
@pytest.mark.parametrize('attn_backend',
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
])
#
[_Backend.XFORMERS, _Backend.FLASH_ATTN])
def
test_multi_step_correct_kvcache
(
num_steps
,
attn_backend
):
#
def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model
#
"""Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token.
#
is correctly updated for sequences with bonus token.
"""
#
"""
seed
=
100
#
seed = 100
model_name
=
"JackFram/llama-68m"
#
model_name = "JackFram/llama-68m"
block_size
=
16
#
block_size = 16
num_gpu_blocks
=
2048
//
block_size
#
num_gpu_blocks = 2048 // block_size
batch_size
=
1
#
batch_size = 1
with
global_force_attn_backend_context_manager
(
attn_backend
):
#
with global_force_attn_backend_context_manager(attn_backend):
dtype
=
'float16'
if
attn_backend
==
_Backend
.
FLASH_ATTN
else
'float32'
#
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
multi_step_worker
=
create_worker
(
MultiStepWorker
,
#
multi_step_worker = create_worker(MultiStepWorker,
model_name
,
#
model_name,
block_size
,
#
block_size,
num_gpu_blocks
,
#
num_gpu_blocks,
seed
,
#
seed,
model_runner_cls
=
TP1DraftModelRunner
,
#
model_runner_cls=TP1DraftModelRunner,
dtype
=
dtype
)
#
dtype=dtype)
multi_step_worker
.
set_include_gpu_probs_tensor
()
#
multi_step_worker.set_include_gpu_probs_tensor()
worker
=
create_worker
(
Worker
,
#
worker = create_worker(Worker,
model_name
,
#
model_name,
block_size
,
#
block_size,
num_gpu_blocks
,
#
num_gpu_blocks,
seed
,
#
seed,
dtype
=
dtype
)
#
dtype=dtype)
prompts
=
[[
0
]
for
_
in
range
(
batch_size
)]
#
prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence
#
# Already generate two tokens for the sequence
# so that we can simulate the bonus token case
#
# so that we can simulate the bonus token case
multi_step_continuations
=
[[
#
multi_step_continuations = [[
random
.
randint
(
0
,
1000
),
#
random.randint(0, 1000),
random
.
randint
(
0
,
1000
)
#
random.randint(0, 1000)
]
for
_
in
prompts
]
#
] for _ in prompts]
final_prompt_lens
=
[
len
(
prompt
)
+
2
+
num_steps
for
prompt
in
prompts
]
#
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
seq_ids_with_bonus_token_in_last_step
=
set
(
range
(
batch_size
))
#
seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
multi_step_continuations
,
#
continuations=multi_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
# Run multi-step.
#
# Run multi-step.
zero_kv_cache
(
multi_step_worker
.
cache_engine
)
#
zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker
.
sampler_output
(
execute_model_req
=
ExecuteModelRequest
(
#
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
),
#
seq_group_metadata_list=seq_group_metadata_list),
sample_len
=
num_steps
,
#
sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step
=
#
seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step
)
#
seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly.
#
# Run single-step repeatedly.
zero_kv_cache
(
worker
.
cache_engine
)
#
zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first
#
# Generate the kv cache for the bonus token first
single_step_continuations
=
[
c
[:
1
]
for
c
in
multi_step_continuations
]
#
single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
single_step_continuations
,
#
continuations=single_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
single_step_output
=
worker
.
execute_model
(
#
single_step_output = worker.execute_model(
execute_model_req
=
ExecuteModelRequest
(
#
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
))
#
seq_group_metadata_list=seq_group_metadata_list))
for
_
in
range
(
num_steps
):
#
for _ in range(num_steps):
seq_group_metadata_list
=
create_seq_group_metadata_from_prompts
(
#
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts
,
#
prompts,
num_gpu_blocks
,
#
num_gpu_blocks,
block_size
,
#
block_size,
continuations
=
multi_step_continuations
,
#
continuations=multi_step_continuations,
final_prompt_lens
=
final_prompt_lens
)
#
final_prompt_lens=final_prompt_lens)
single_step_output
=
worker
.
execute_model
(
#
single_step_output = worker.execute_model(
execute_model_req
=
ExecuteModelRequest
(
#
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list
=
seq_group_metadata_list
))
#
seq_group_metadata_list=seq_group_metadata_list))
for
i
,
seq_group_output
in
enumerate
(
single_step_output
[
-
1
]):
#
for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations
[
i
].
append
(
#
multi_step_continuations[i].append(
seq_group_output
.
samples
[
0
].
output_token
)
#
seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and
#
# Verify that the KV cache of the single-step and
# multi-step workers are the same.
#
# multi-step workers are the same.
single_step_gpu_cache
=
worker
.
cache_engine
[
0
].
gpu_cache
#
single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache
=
multi_step_worker
.
cache_engine
[
0
].
gpu_cache
#
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers
=
len
(
single_step_gpu_cache
)
#
num_layers = len(single_step_gpu_cache)
allclose
=
lambda
a
,
b
:
torch
.
allclose
(
#
allclose = lambda a, b: torch.allclose(
a
.
cuda
(),
b
.
cuda
(),
rtol
=
1e-2
,
atol
=
1e-2
)
#
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
for
i
in
range
(
num_layers
):
#
for i in range(num_layers):
assert
allclose
(
single_step_gpu_cache
[
i
][
0
],
#
assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache
[
i
][
0
])
#
multi_step_gpu_cache[i][0])
assert
allclose
(
single_step_gpu_cache
[
i
][
1
],
#
assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache
[
i
][
1
])
#
multi_step_gpu_cache[i][1])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
...
tests/spec_decode/test_spec_decode_worker.py
View file @
5e078c69
...
@@ -5,6 +5,7 @@ from collections import defaultdict
...
@@ -5,6 +5,7 @@ from collections import defaultdict
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
...
@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from
.test_utils
import
mock_spec_decode_sampler
from
.test_utils
import
mock_spec_decode_sampler
from
.utils
import
(
create_batch
,
create_sampler_output_list
,
create_worker
,
from
.utils
import
(
create_batch
,
create_sampler_output_list
,
create_worker
,
mock_worker
)
mock_worker
)
from
..utils
import
models_path_prefix
@
pytest
.
mark
.
parametrize
(
'k'
,
[
1
,
2
,
6
])
@
pytest
.
mark
.
parametrize
(
'k'
,
[
1
,
2
,
6
])
...
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
...
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks
=
8096
//
block_size
num_gpu_blocks
=
8096
//
block_size
target_worker
=
create_worker
(
target_worker
=
create_worker
(
Worker
,
Worker
,
"JackFram/llama-68m"
,
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
)
,
block_size
,
block_size
,
num_gpu_blocks
,
num_gpu_blocks
,
seed
,
seed
,
)
)
draft_worker
=
create_worker
(
draft_worker
=
create_worker
(
MultiStepWorker
,
MultiStepWorker
,
"abhigoyal/vllm-eagle-llama-68m-random"
,
os
.
path
.
join
(
models_path_prefix
,
"abhigoyal/vllm-eagle-llama-68m-random"
)
,
block_size
,
block_size
,
num_gpu_blocks
,
num_gpu_blocks
,
seed
,
seed
,
...
@@ -941,4 +943,4 @@ def test_correctly_load_weight_for_eagle():
...
@@ -941,4 +943,4 @@ def test_correctly_load_weight_for_eagle():
target_worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
)
target_worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
)
assert
torch
.
allclose
(
assert
torch
.
allclose
(
worker
.
proposer_worker
.
worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
,
worker
.
proposer_worker
.
worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
,
worker
.
scorer_worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
)
worker
.
scorer_worker
.
model_runner
.
model
.
lm_head
.
weight
.
data
)
\ No newline at end of file
tests/tensorizer_loader/test_tensorizer.py
View file @
5e078c69
...
@@ -7,6 +7,7 @@ import pathlib
...
@@ -7,6 +7,7 @@ import pathlib
import
subprocess
import
subprocess
from
functools
import
partial
from
functools
import
partial
from
unittest.mock
import
MagicMock
,
patch
from
unittest.mock
import
MagicMock
,
patch
from
typing
import
List
,
Tuple
,
Optional
import
openai
import
openai
import
pytest
import
pytest
...
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
...
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
RequestOutput
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.lora.request
import
LoRARequest
# yapf conflicts with isort for this docstring
# yapf conflicts with isort for this docstring
# yapf: disable
# yapf: disable
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
...
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
...
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
EXAMPLES_PATH
/
"offline_inference/multilora_inference.py"
,
)
)
model_ref
=
"meta-llama/Llama-2-7b-hf"
model_ref
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-2-7b-hf"
)
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
lora_path
=
os
.
path
.
join
(
models_path_prefix
,
"yard1/llama-2-7b-sql-lora-test"
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
test_prompts
=
multilora_inference
.
create_test_prompts
(
lora_path
)
...
@@ -431,4 +433,4 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
...
@@ -431,4 +433,4 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
prompts
,
sampling_params
)
prompts
,
sampling_params
)
# noqa: E501
# noqa: E501
assert
outputs
==
deserialized_outputs
assert
outputs
==
deserialized_outputs
\ No newline at end of file
tests/tool_use/utils.py
View file @
5e078c69
...
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama"
:
{
"llama"
:
{
"model"
:
"model"
:
"meta-llama/Meta-Llama-3.1-8B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3.1-8B-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama3.2"
:
{
"llama3.2"
:
{
"model"
:
"model"
:
"meta-llama/Llama-3.2-3B-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-3B-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
"--tool-call-parser"
,
"llama3_json"
,
"--chat-template"
,
...
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama4"
:
{
"llama4"
:
{
"model"
:
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"llama4_json"
:
{
"llama4_json"
:
{
"model"
:
"model"
:
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"-tp"
,
"4"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"-tp"
,
"4"
,
"--distributed-executor-backend"
,
"mp"
,
"--tool-call-parser"
,
"--distributed-executor-backend"
,
"mp"
,
"--tool-call-parser"
,
...
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# },
# },
"granite-3.0-8b"
:
{
"granite-3.0-8b"
:
{
"model"
:
"model"
:
"ibm-granite/granite-3.0-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.0-8b-instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"granite"
,
"--chat-template"
,
"--tool-call-parser"
,
"granite"
,
"--chat-template"
,
...
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"granite-3.1-8b"
:
{
"granite-3.1-8b"
:
{
"model"
:
"model"
:
"ibm-granite/granite-3.1-8b-instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3.1-8b-instruct"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--no-enable-prefix-caching"
,
...
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"internlm"
:
{
"internlm"
:
{
"model"
:
"model"
:
"internlm/internlm2_5-7b-chat"
,
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2_5-7b-chat"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"internlm"
,
"--chat-template"
,
"--tool-call-parser"
,
"internlm"
,
"--chat-template"
,
...
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
...
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
},
"toolACE"
:
{
"toolACE"
:
{
"model"
:
"model"
:
"Team-ACE/ToolACE-8B"
,
os
.
path
.
join
(
models_path_prefix
,
"Team-ACE/ToolACE-8B"
)
,
"arguments"
:
[
"arguments"
:
[
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--enforce-eager"
,
"--no-enable-prefix-caching"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
"--tool-call-parser"
,
"pythonic"
,
"--chat-template"
,
...
@@ -361,4 +361,4 @@ MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
...
@@ -361,4 +361,4 @@ MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
"content"
:
"content"
:
"The weather in Orlando FL is 78 degrees fahrenheit with clear"
"The weather in Orlando FL is 78 degrees fahrenheit with clear"
"skies."
"skies."
}]
}]
\ No newline at end of file
tests/tpu/test_quantization_accuracy.py
→
tests/tpu/
un
test_quantization_accuracy.py
View file @
5e078c69
...
@@ -4,6 +4,8 @@ from dataclasses import dataclass
...
@@ -4,6 +4,8 @@ from dataclasses import dataclass
import
lm_eval
import
lm_eval
import
pytest
import
pytest
import
os
from
..utils
import
models_path_prefix
TASK
=
"gsm8k"
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
...
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
...
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs.
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS
=
[
ACCURACY_CONFIGS
=
[
GSM8KAccuracyTestConfig
(
GSM8KAccuracyTestConfig
(
model_name
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
)
,
excepted_value
=
0.76
),
# no bias
excepted_value
=
0.76
),
# no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# so only one of these tests can run in a single call to pytest. As
...
@@ -48,4 +50,4 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
...
@@ -48,4 +50,4 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
measured_value
=
results
[
"results"
][
TASK
][
FILTER
]
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
\ No newline at end of file
tests/v1/tpu/test_basic.py
→
tests/v1/tpu/
un
test_basic.py
View file @
5e078c69
File moved
tests/v1/tpu/test_mha_attn.py
→
tests/v1/tpu/
un
test_mha_attn.py
View file @
5e078c69
File moved
tests/v1/tpu/test_multimodal.py
→
tests/v1/tpu/
un
test_multimodal.py
View file @
5e078c69
File moved
tests/v1/tpu/test_pallas.py
→
tests/v1/tpu/
un
test_pallas.py
View file @
5e078c69
File moved
tests/v1/tpu/test_perf.py
→
tests/v1/tpu/
un
test_perf.py
View file @
5e078c69
File moved
tests/v1/tpu/test_sampler.py
→
tests/v1/tpu/
un
test_sampler.py
View file @
5e078c69
File moved
tests/v1/tpu/test_topk_topp_sampler.py
→
tests/v1/tpu/
un
test_topk_topp_sampler.py
View file @
5e078c69
File moved
tests/v1/tpu/worker/test_tpu_model_runner.py
→
tests/v1/tpu/worker/
un
test_tpu_model_runner.py
View file @
5e078c69
File moved
tests/weight_loading/__init__.py
0 → 100644
View file @
5e078c69
tests/weight_loading/test_weight_loading.py
→
tests/weight_loading/
un
test_weight_loading.py
View file @
5e078c69
File moved
tests/worker/test_model_runner.py
View file @
5e078c69
...
@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
...
@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
def
test_deepseek_mla_attn_backend_module
():
def
test_deepseek_mla_attn_backend_module
():
model_runner
=
_create_model_runner
(
model_runner
=
_create_model_runner
(
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
,
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
)
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enable_chunked_prefill
=
False
,
enable_chunked_prefill
=
False
,
)
)
...
@@ -383,4 +383,4 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
...
@@ -383,4 +383,4 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
assert
attr_expected
[
1
]
==
attr_actual
[
1
]
assert
attr_expected
[
1
]
==
attr_actual
[
1
]
for
attr_expected
,
attr_actual
in
zip
(
vars
(
attn_metadata
.
decode_metadata
),
for
attr_expected
,
attr_actual
in
zip
(
vars
(
attn_metadata
.
decode_metadata
),
vars
(
decode_meta_actual
)):
vars
(
decode_meta_actual
)):
assert
attr_expected
[
1
]
==
attr_actual
[
1
]
assert
attr_expected
[
1
]
==
attr_actual
[
1
]
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment