Unverified Commit 47e9038d authored by Michael Goin's avatar Michael Goin Committed by GitHub
Browse files

Fix cpu offload testing for gptq/awq/ct (#15648)


Signed-off-by: default avatarmgoin <mgoin64@gmail.com>
parent 432cf22a
......@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
def test_cpu_offload_gptq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test GPTQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
["--cpu-offload-gb", "1"],
......@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
def test_cpu_offload_awq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test AWQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
["--cpu-offload-gb", "1"],
......@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
def test_cpu_offload_compressed_tensors(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
["--cpu-offload-gb", "1"],
......
......@@ -317,6 +317,37 @@ def _test_completion_close(
return results
def _test_chat(
client: openai.OpenAI,
model: str,
prompt: str,
):
results = []
messages = [{
"role": "user",
"content": [{
"type": "text",
"text": prompt
}]
}]
# test with text prompt
chat_response = client.chat.completions.create(model=model,
messages=messages,
max_tokens=5,
temperature=0.0)
results.append({
"test": "completion_close",
"text": chat_response.choices[0].message.content,
"finish_reason": chat_response.choices[0].finish_reason,
"usage": chat_response.usage,
})
return results
def _test_embeddings(
client: openai.OpenAI,
model: str,
......@@ -512,6 +543,8 @@ def compare_all_settings(model: str,
results += _test_completion(client, model, prompt, token_ids)
elif method == "generate_close":
results += _test_completion_close(client, model, prompt)
elif method == "generate_chat":
results += _test_chat(client, model, prompt)
elif method == "generate_with_image":
results += _test_image_text(
client, model,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment