Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
155 additions
and
8 deletions
+155
-8
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+1
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
...ugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+1
-0
tests/plugins/vllm_add_dummy_platform/setup.py
tests/plugins/vllm_add_dummy_platform/setup.py
+1
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
...lm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+1
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
...atform/vllm_add_dummy_platform/dummy_attention_backend.py
+1
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+1
-0
tests/plugins_tests/conftest.py
tests/plugins_tests/conftest.py
+1
-0
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+1
-0
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
+1
-0
tests/pplx_utils.py
tests/pplx_utils.py
+123
-0
tests/prefix_caching/test_disable_sliding_window.py
tests/prefix_caching/test_disable_sliding_window.py
+1
-0
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+1
-0
tests/prompt_adapter/test_bloom.py
tests/prompt_adapter/test_bloom.py
+1
-0
tests/prompt_adapter/test_multi_adapter_inference.py
tests/prompt_adapter/test_multi_adapter_inference.py
+1
-0
tests/prompt_adapter/test_pa_lora.py
tests/prompt_adapter/test_pa_lora.py
+1
-0
tests/quantization/test_auto_round.py
tests/quantization/test_auto_round.py
+1
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+1
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+13
-7
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+1
-0
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+2
-1
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
from
typing
import
Optional
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
from
typing
import
Optional
...
...
tests/plugins/vllm_add_dummy_platform/setup.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
setuptools
import
setup
from
setuptools
import
setup
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
from
typing
import
Optional
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.attention.backends.flash_attn
import
FlashAttentionBackend
from
vllm.attention.backends.flash_attn
import
FlashAttentionBackend
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.platforms.cuda
import
CudaPlatform
from
vllm.platforms.cuda
import
CudaPlatform
...
...
tests/plugins_tests/conftest.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
...
...
tests/plugins_tests/test_platform_plugins.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/plugins_tests/test_scheduler_plugins.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
...
...
tests/pplx_utils.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
dataclasses
import
os
import
traceback
from
typing
import
Callable
import
torch
from
torch.multiprocessing
import
(
spawn
)
# pyright: ignore[reportPrivateImportUsage]
from
typing_extensions
import
Concatenate
,
ParamSpec
# Parameter specification for the extra arguments a worker accepts after its
# leading ProcessGroupInfo argument; the launch helpers below use it to type
# the *args / **kwargs they forward to the worker.
P = ParamSpec("P")
@dataclasses.dataclass
class ProcessGroupInfo:
    """Describes one spawned worker's position in the process group.

    An instance is built by ``_worker_parallel_launch`` and handed to the
    worker callable as its first positional argument.
    """

    # Total number of processes in the group.
    world_size: int
    # Number of processes on the current node.
    world_local_size: int
    # Global rank, computed as node_rank * world_local_size + local_rank.
    rank: int
    # Rank of the node this process runs on.
    node_rank: int
    # Index of this process within its node; also used as the CUDA device index.
    local_rank: int
    # CUDA device selected for this process.
    device: torch.device
def _worker_parallel_launch(
    local_rank: int,
    world_size: int,
    world_local_size: int,
    node_rank: int,
    init_method: str,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Per-process entry point: set up the process group and run ``worker``.

    Args:
        local_rank: Index of this process on its node; selects the CUDA
            device used by the process.
        world_size: Total number of processes in the group.
        world_local_size: Number of processes on each node.
        node_rank: Rank of the node this process runs on.
        init_method: Rendezvous URL passed to ``init_process_group``.
        worker: Callable invoked as ``worker(info, *args, **kwargs)`` where
            ``info`` is the ``ProcessGroupInfo`` for this process.
        *args: Extra positional arguments forwarded to ``worker``.
        **kwargs: Extra keyword arguments forwarded to ``worker``.

    Raises:
        Exception: Any exception raised by ``worker`` is printed together
            with its traceback and then re-raised.
    """
    global_rank = node_rank * world_local_size + local_rank

    # Bind this process to its GPU before any distributed setup happens.
    torch.cuda.set_device(local_rank)
    cuda_device = torch.device("cuda", local_rank)

    torch.distributed.init_process_group(
        backend="cpu:gloo,cuda:nccl",
        init_method=init_method,
        rank=global_rank,
        world_size=world_size,
        device_id=cuda_device,
    )

    # One-element all-reduce issued before the worker runs; presumably a
    # startup barrier so every rank is connected before real work begins.
    sync_token = torch.tensor([global_rank], device=cuda_device)
    torch.distributed.all_reduce(sync_token)

    try:
        info = ProcessGroupInfo(
            world_size=world_size,
            world_local_size=world_local_size,
            rank=global_rank,
            node_rank=node_rank,
            local_rank=local_rank,
            device=cuda_device,
        )
        worker(info, *args, **kwargs)
    except Exception as exc:
        # Print the failure from inside the child process before propagating.
        print(exc)
        traceback.print_exc()
        raise
    finally:
        # Tear the group down whether the worker succeeded or failed.
        torch.distributed.destroy_process_group()
def parallel_launch(
    world_size: int,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Run ``worker`` in ``world_size`` spawned processes on this machine.

    Every process is started through ``_worker_parallel_launch`` with a
    fixed ``tcp://localhost:29500`` rendezvous address, node rank 0 and a
    local world size equal to ``world_size``. The call blocks until all
    processes have finished.

    Args:
        world_size: Number of processes to spawn.
        worker: Callable receiving a ``ProcessGroupInfo`` followed by
            ``*args``.
        *args: Extra positional arguments forwarded to ``worker``.
        **kwargs: Must be empty; keyword arguments are not supported.
    """
    assert not kwargs

    # Arguments for _worker_parallel_launch that follow the process index,
    # which spawn() supplies itself as the first positional argument.
    launch_args = (
        world_size,  # world_size
        world_size,  # world_local_size: all processes share this node
        0,  # node_rank
        "tcp://localhost:29500",  # init_method
        worker,
        *args,
    )
    spawn(
        _worker_parallel_launch,
        args=launch_args,
        nprocs=world_size,
        join=True,
    )
def parallel_launch_from_env(
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """
    Launch ``worker`` in parallel using settings taken from the environment.

    One process is spawned per local slot on the current node, and the
    process group is initialised with the ``env://`` method. The following
    environment variables must be set:
    - WORLD_SIZE: The total number of processes.
    - WORLD_LOCAL_SIZE: The number of processes on the current node.
    - NODE_RANK: The rank of the current node.
    - MASTER_ADDR: The address of the master process.
    - MASTER_PORT: The port of the master process.

    Keyword arguments are not supported; ``kwargs`` must be empty.
    """
    assert not kwargs

    env = os.environ
    # Looked up in this order so the first missing or non-integer variable
    # is the one reported.
    world_size, world_local_size, node_rank = (
        int(env[key])
        for key in ("WORLD_SIZE", "WORLD_LOCAL_SIZE", "NODE_RANK")
    )

    # Only presence is checked here; the values are not read in this function.
    for required in ("MASTER_ADDR", "MASTER_PORT"):
        assert required in env

    # Arguments for _worker_parallel_launch that follow the process index,
    # which spawn() supplies itself as the first positional argument.
    launch_args = (
        world_size,
        world_local_size,
        node_rank,
        "env://",
        worker,
        *args,
    )
    spawn(
        _worker_parallel_launch,
        args=launch_args,
        nprocs=world_local_size,
        join=True,
    )
tests/prefix_caching/test_disable_sliding_window.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
...
...
tests/prefix_caching/test_prefix_caching.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching.
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
...
...
tests/prompt_adapter/test_bloom.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
...
...
tests/prompt_adapter/test_multi_adapter_inference.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
EngineArgs
,
LLMEngine
,
SamplingParams
from
vllm
import
EngineArgs
,
LLMEngine
,
SamplingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
...
...
tests/prompt_adapter/test_pa_lora.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
...
...
tests/quantization/test_auto_round.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
"""Test model set-up and inference for quantized HF models supported
on the AutoRound.
on the AutoRound.
...
...
tests/quantization/test_bitsandbytes.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
'''Tests whether bitsandbytes computation is enabled correctly.
'''Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
Run `pytest tests/quantization/test_bitsandbytes.py`.
...
...
tests/quantization/test_compressed_tensors.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and weight loading for llmcompressor-quantized models.
"""Test model set-up and weight loading for llmcompressor-quantized models.
Run `pytest tests/quantization/test_compressed_tensors.py`.
Run `pytest tests/quantization/test_compressed_tensors.py`.
...
@@ -13,9 +14,10 @@ from compressed_tensors.quantization import QuantizationType
...
@@ -13,9 +14,10 @@ from compressed_tensors.quantization import QuantizationType
from
tests.models.utils
import
check_logprobs_close
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensors24
,
CompressedTensorsLinearMethod
,
CompressedTensors24
,
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16Fp4
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW4A4Fp4
,
CompressedTensorsW4A16Fp4
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
CompressedTensorsW8A8Int8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
sparse_cutlass_supported
)
sparse_cutlass_supported
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -650,9 +652,13 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
...
@@ -650,9 +652,13 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
assert
output
assert
output
def
test_compressed_tensors_nvfp4a16
(
vllm_runner
):
@
pytest
.
mark
.
parametrize
(
# run weight only example
"args"
,
model
=
"nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
[(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
,
CompressedTensorsW4A16Fp4
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4"
,
CompressedTensorsW4A4Fp4
)])
def
test_compressed_tensors_nvfp4
(
vllm_runner
,
args
):
model
,
scheme
=
args
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
def
check_model
(
model
):
...
@@ -661,7 +667,7 @@ def test_compressed_tensors_nvfp4a16(vllm_runner):
...
@@ -661,7 +667,7 @@ def test_compressed_tensors_nvfp4a16(vllm_runner):
qkv_proj
=
layer
.
self_attn
.
qkv_proj
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW4A16Fp4
)
assert
isinstance
(
qkv_proj
.
scheme
,
scheme
)
assert
qkv_proj
.
scheme
.
group_size
==
16
assert
qkv_proj
.
scheme
.
group_size
==
16
llm
.
apply_model
(
check_model
)
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_configs.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests whether Marlin models can be loaded from the autogptq config.
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_configs.py --forked`.
Run `pytest tests/quantization/test_configs.py --forked`.
...
...
tests/quantization/test_cpu_offload.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Expanded quantized model tests for CPU offloading
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
# Base tests: tests/basic_correctness/test_cpu_offload.py
...
...
Prev
1
…
27
28
29
30
31
32
33
34
35
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment