Unverified commit f6aa1226, authored by Alex and committed by GitHub
Browse files

[CI Sprint] Quantization CI Cleanup (#24130)


Signed-off-by: Alex Yun <alexyun04@gmail.com>
parent 184b12fd
...@@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): ...@@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
"neuralmagic/Llama-3.2-1B-quantized.w8a8", "neuralmagic/Llama-3.2-1B-quantized.w8a8",
], ],
) )
@pytest.mark.parametrize("max_tokens", [8]) @pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False] "use_aiter", [True, False] if current_platform.is_rocm() else [False]
...@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs( ...@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
example_prompts, max_tokens, num_logprobs example_prompts, max_tokens, num_logprobs
) )
with vllm_runner(model_path, dtype=dtype) as vllm_model: with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs example_prompts, max_tokens, num_logprobs
) )
......
...@@ -19,8 +19,8 @@ def test_cpu_offload_fp8(): ...@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
# Test loading a quantized checkpoint # Test loading a quantized checkpoint
compare_two_settings( compare_two_settings(
"neuralmagic/Qwen2-1.5B-Instruct-FP8", "neuralmagic/Qwen2-1.5B-Instruct-FP8",
[], ["--enforce_eager"],
["--cpu-offload-gb", "1"], ["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480, max_wait_seconds=480,
) )
...@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch): ...@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
# Test GPTQ Marlin # Test GPTQ Marlin
compare_two_settings( compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
[], ["--enforce_eager"],
["--cpu-offload-gb", "1"], ["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480, max_wait_seconds=480,
) )
...@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch): ...@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
# Test AWQ Marlin # Test AWQ Marlin
compare_two_settings( compare_two_settings(
"Qwen/Qwen2-1.5B-Instruct-AWQ", "Qwen/Qwen2-1.5B-Instruct-AWQ",
[], ["--enforce_eager"],
["--cpu-offload-gb", "1"], ["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480, max_wait_seconds=480,
) )
...@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): ...@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
# Test wNa16 # Test wNa16
compare_two_settings( compare_two_settings(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2", "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
[], ["--enforce_eager"],
["--cpu-offload-gb", "1"], ["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480, max_wait_seconds=480,
) )
...@@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"] ...@@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
) )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10]) @pytest.mark.parametrize("max_tokens", [4])
def test_model_experts_int8_startup( def test_model_experts_int8_startup(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -33,5 +33,7 @@ def test_model_experts_int8_startup( ...@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model: with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens) vllm_model.generate_greedy(example_prompts, max_tokens)
...@@ -45,10 +45,10 @@ def test_model_load_and_run( ...@@ -45,10 +45,10 @@ def test_model_load_and_run(
if force_marlin: if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner(model_id) as llm: with vllm_runner(model_id, enforce_eager=True) as llm:
# note: this does not test accuracy, just that we can run through # note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy # see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1]) print(outputs[0][1])
...@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run( ...@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
# `LLM.apply_model` requires pickling a function. # `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
def check_model(model): def check_model(model):
attn = model.model.layers[0].self_attn.attn attn = model.model.layers[0].self_attn.attn
...@@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run( ...@@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(
# note: this does not test accuracy, just that we can run through # note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy # see lm-eval tests for accuracy
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10) outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1]) print(outputs[0][1])
...@@ -142,7 +142,10 @@ def test_load_fp16_model( ...@@ -142,7 +142,10 @@ def test_load_fp16_model(
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner( with vllm_runner(
"facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype "facebook/opt-125m",
quantization="fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
) as llm: ) as llm:
def check_model(model): def check_model(model):
......
...@@ -26,7 +26,7 @@ DTYPE = ["bfloat16"] ...@@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE) @pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype): def test_ipex_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm: with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32) output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
assert output assert output
print(output) print(output)
...@@ -49,4 +49,4 @@ def test_lm_head( ...@@ -49,4 +49,4 @@ def test_lm_head(
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1]) print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])
...@@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner): ...@@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
llm.apply_model(check_model) llm.apply_model(check_model)
# Run a simple generation test to ensure the model works # Run a simple generation test to ensure the model works
output = llm.generate_greedy(["Hello my name is"], max_tokens=20) output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output assert output
print(f"ModelOpt FP8 output: {output}") print(f"ModelOpt FP8 output: {output}")
...@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: ...@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
"facebook/opt-125m", "facebook/opt-125m",
dtype=dtype, dtype=dtype,
quantization="ptpc_fp8", quantization="ptpc_fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
) )
except AssertionError as e: except AssertionError as e:
...@@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: ...@@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
llm.apply_model(check_model) llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output assert output
...@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import ( ...@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import (
get_quantization_config, get_quantization_config,
register_quantization_config, register_quantization_config,
) )
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizationConfig, # noqa: E501
) )
...@@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch): ...@@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
llm.apply_model(check_model) llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=1)
assert output assert output
...@@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant( ...@@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
assert not has_int4_preshuffled_tensor assert not has_int4_preshuffled_tensor
assert weight_attrs == [False, 1, 0, True] assert weight_attrs == [False, 1, 0, True]
output = llm.generate_greedy(["The capital of France is"], max_tokens=32) output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
assert output assert output
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment