Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
309 additions
and
55 deletions
+309
-55
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+2
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+2
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+2
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
...ugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+2
-0
tests/plugins/vllm_add_dummy_platform/setup.py
tests/plugins/vllm_add_dummy_platform/setup.py
+2
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
...lm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+2
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
...atform/vllm_add_dummy_platform/dummy_attention_backend.py
+2
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+2
-0
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+2
-0
tests/prefix_caching/test_disable_sliding_window.py
tests/prefix_caching/test_disable_sliding_window.py
+1
-0
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+1
-0
tests/prompt_adapter/test_bloom.py
tests/prompt_adapter/test_bloom.py
+2
-0
tests/prompt_adapter/test_multi_adapter_inference.py
tests/prompt_adapter/test_multi_adapter_inference.py
+2
-0
tests/prompt_adapter/test_pa_lora.py
tests/prompt_adapter/test_pa_lora.py
+2
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+1
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+276
-55
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+1
-0
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+2
-0
tests/quantization/test_experts_int8.py
tests/quantization/test_experts_int8.py
+2
-0
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
ModelRegistry
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Iterable
,
List
,
Optional
,
Tuple
,
Union
import
torch
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
torch
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
torch
...
...
tests/plugins/vllm_add_dummy_platform/setup.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
setuptools
import
setup
setup
(
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm.attention.backends.flash_attn
import
FlashAttentionBackend
...
...
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm.platforms.cuda
import
CudaPlatform
...
...
tests/plugins_tests/test_platform_plugins.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
torch
from
tests.kernels.utils
import
override_backend_env_variable
...
...
tests/prefix_caching/test_disable_sliding_window.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
...
...
tests/prefix_caching/test_prefix_caching.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
...
...
tests/prompt_adapter/test_bloom.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
os
...
...
tests/prompt_adapter/test_multi_adapter_inference.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
EngineArgs
,
LLMEngine
,
SamplingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
..utils
import
models_path_prefix
...
...
tests/prompt_adapter/test_pa_lora.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
huggingface_hub
import
snapshot_download
from
vllm
import
EngineArgs
,
LLMEngine
,
SamplingParams
...
...
tests/quantization/test_bitsandbytes.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
'''Tests whether bitsandbytes computation is enabled correctly.
Run `pytest tests/quantization/test_bitsandbytes.py`.
...
...
tests/quantization/test_compressed_tensors.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Test model set-up and weight loading for llmcompressor-quantized models.
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
from
typing
import
Optional
import
pytest
...
...
@@ -24,12 +26,30 @@ from ..utils import models_path_prefix
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
),
"tensor"
,
QuantizationType
.
INT
,
2560
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
),
"channel"
,
QuantizationType
.
INT
,
2560
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"
),
"tensor"
,
QuantizationType
.
INT
,
2560
,
False
)])
[
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
),
"tensor"
,
QuantizationType
.
INT
,
2560
,
True
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
),
"channel"
,
QuantizationType
.
INT
,
2560
,
True
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"
),
"tensor"
,
QuantizationType
.
INT
,
2560
,
False
,
),
],
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
,
model_args
):
model_path
,
strategy
,
quant_type
,
shape_0
,
is_symmetric
=
model_args
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
...
...
@@ -87,21 +107,31 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
assert
output
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
)
])
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
),
],
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
def
test_compressed_tensors_w8a8_logprobs
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_path
,
max_tokens
,
num_logprobs
):
def
test_compressed_tensors_w8a8_logprobs
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_path
,
max_tokens
,
num_logprobs
,
):
dtype
=
"bfloat16"
# skip language translation prompt for the static per tensor asym model
if
model_path
==
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
):
# noqa: E501
if
(
model_path
==
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
)
):
# noqa: E501
example_prompts
=
example_prompts
[
0
:
-
1
]
with
hf_runner
(
model_path
,
dtype
=
dtype
)
as
hf_model
:
...
...
@@ -127,13 +157,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
assert
output
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
),
"tensor"
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"
),
"tensor"
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
),
"channel"
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"
),
"channel"
),
])
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
),
"tensor"
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"
),
"tensor"
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
),
"channel"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"
),
"channel"
,
),
],
)
def
test_compressed_tensors_w8a8_dynamic_per_token
(
vllm_runner
,
model_args
):
model_path
,
strategy
=
model_args
with
vllm_runner
(
model_path
,
dtype
=
torch
.
float16
)
as
llm
:
...
...
@@ -160,9 +198,12 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
reason
=
"WNA16 is not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
(
"wNa16_args"
,
[(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
),
"channel"
,
None
,
8
),
[
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
),
"channel"
,
None
,
8
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w4a16-group128-v2"
),
"group"
,
128
,
8
),
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
),
"channel"
,
None
,
4
)])
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
),
"channel"
,
None
,
4
),
],
)
def
test_compressed_tensors_wNa16
(
vllm_runner
,
wNa16_args
):
model
,
strategy
,
group
,
pack_factor
=
wNa16_args
with
vllm_runner
(
model
)
as
llm
:
...
...
@@ -226,7 +267,8 @@ def test_compressed_tensors_fp8(vllm_runner):
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
(
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A16Fp8
))
(
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A16Fp8
),
)
assert
qkv_proj
.
input_scale
.
dtype
is
torch
.
float32
...
...
@@ -251,9 +293,14 @@ def test_compressed_tensors_kv_cache(vllm_runner):
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
def
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
):
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
def
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
,
format
=
"dense"
):
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensors24
)
...
...
@@ -262,22 +309,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
assert
qkv_proj
.
scheme
.
quantized
assert
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
sparsity_map
=
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
# noqa: E501
assert
sparsity_map
.
get
(
"Linear"
).
format
==
"dense"
assert
sparsity_map
.
get
(
"Linear"
).
format
==
format
assert
sparsity_map
.
get
(
"Linear"
).
sparsity_structure
==
"2:4"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing"
,
"channel"
,
"token"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing"
,
"channel"
,
"tensor"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing"
,
"tensor"
,
"tensor"
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
),
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing"
,
"channel"
,
"token"
,
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing"
,
"channel"
,
"tensor"
,
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing"
,
"tensor"
,
"tensor"
,
),
(
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
,
),
],
)
def
test_compressed_tensors_2of4_quant_fp8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
...
...
@@ -296,16 +360,134 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing"
,
"channel"
,
"token"
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing"
,
"tensor"
,
"tensor"
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
),
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
,
"channel"
,
"token"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM"
,
"channel"
,
"tensor"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM"
,
"tensor"
,
"token"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM"
,
"tensor"
,
"tensor"
,
),
],
)
def
test_compressed_tensors_2of4_quant_fp8_compressed
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
qkv_proj
.
scheme
.
weights_dtype
==
torch
.
float8_e4m3fn
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
,
format
=
"sparse-24-bitmask"
,
)
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"cutlass is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM"
,
"channel"
,
"token"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM"
,
"channel"
,
"tensor"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM"
,
"tensor"
,
"token"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM"
,
"tensor"
,
"tensor"
,
),
],
)
def
test_compressed_tensors_2of4_quant_int8_compressed
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
qkv_proj
.
scheme
.
weights_dtype
==
torch
.
int8
_test_2of4_quant_models
(
qkv_proj
,
weight_strategy
,
input_strategy
,
format
=
"sparse-24-bitmask"
,
)
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing"
,
"channel"
,
"token"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing"
,
"tensor"
,
"tensor"
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing"
,
"tensor"
,
"token"
,
),
],
)
def
test_compressed_tensors_2of4_quant_int8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
...
...
@@ -327,10 +509,12 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
@
pytest
.
mark
.
skip
(
reason
=
"2of4 sparse w16a16 CUTLASS produces bad output."
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"2of4 Sparse is not yet supported on this GPU type."
)
reason
=
"2of4 Sparse is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"
)])
[(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"
)],
)
def
test_compressed_tensors_2of4_sparse
(
vllm_runner
,
args_2of4
):
model
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
...
...
@@ -347,7 +531,9 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
assert
qkv_proj
.
scheme
.
input_quant
is
None
assert
not
qkv_proj
.
scheme
.
quantized
assert
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
sparsity_map
=
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
# noqa: E501
sparsity_map
=
(
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
)
# noqa: E501
assert
sparsity_map
.
get
(
"Linear"
).
format
==
"dense"
assert
sparsity_map
.
get
(
"Linear"
).
sparsity_structure
==
"2:4"
...
...
@@ -356,3 +542,38 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Cutlass is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[(
"nm-testing/llama2.c-stories42M-pruned2.4-compressed"
)])
def
test_compressed_tensors_2of4_sparse_compressed
(
vllm_runner
,
args_2of4
):
model
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensors24
)
assert
qkv_proj
.
scheme
.
weight_quant
is
None
assert
qkv_proj
.
scheme
.
input_quant
is
None
assert
not
qkv_proj
.
scheme
.
quantized
assert
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
sparsity_map
=
(
qkv_proj
.
quant_method
.
quantization_config
.
sparsity_scheme_map
)
# noqa: E501
assert
sparsity_map
.
get
(
"Linear"
).
format
==
"sparse-24-bitmask"
assert
sparsity_map
.
get
(
"Linear"
).
sparsity_structure
==
"2:4"
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
tests/quantization/test_configs.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_configs.py --forked`.
...
...
tests/quantization/test_cpu_offload.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
...
...
tests/quantization/test_experts_int8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
"""Tests experts_int8 quantization startup and generation,
doesn't test correctness
...
...
tests/quantization/test_fp8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests whether FP8 computation is enabled correctly.
Run `pytest tests/quantization/test_fp8.py --forked`.
...
...
Prev
1
…
16
17
18
19
20
21
22
23
24
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment