Commit 415b817b authored by 王敏's avatar 王敏
Browse files

merge 092-dev分支近期修改

parents 3c08fbc1 bc9aee38
......@@ -11,6 +11,8 @@ from huggingface_hub import snapshot_download
from safetensors import safe_open
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from utils import models_path_prefix
def patch_eagle_draft_with_lm_head(target_model_id: str,
......@@ -50,10 +52,10 @@ def patch_eagle_draft_with_lm_head(target_model_id: str,
def test_eagle():
patched_draft_path = patch_eagle_draft_with_lm_head(
target_model_id="meta-llama/Llama-2-7b-hf",
draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
target_model_id=os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
draft_model_id=os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"))
llm = LLM(
model="meta-llama/Llama-2-7b-hf",
model=os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
speculative_config={
"model": patched_draft_path,
"num_speculative_tokens": 5,
......@@ -62,6 +64,7 @@ def test_eagle():
max_num_seqs=1,
max_model_len=128,
tensor_parallel_size=2,
block_size = 16 if not current_platform.is_rocm() else 64,
override_neuron_config={
"enable_eagle_speculation": True,
"enable_fused_speculation": True,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from vllm import LLM, SamplingParams
from utils import models_path_prefix
def test_mistral():
llm = LLM(model="mistralai/Mistral-7B-v0.1",
llm = LLM(model=os.path.join(models_path_prefix, "mistralai/Mistral-7B-v0.1"),
tensor_parallel_size=2,
max_num_seqs=4,
max_model_len=128,
......
......@@ -36,14 +36,15 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# assert backend.get_name() == "Dummy_Backend"
def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
    """Check that the out-of-tree plugin replaces the rotary embedding op.

    After the general plugins are loaded, constructing ``RotaryEmbedding``
    is expected to yield an object whose class is named
    ``DummyRotaryEmbedding`` and that carries an ``addition_config``
    attribute. The ``monkeypatch`` fixture is accepted but not used in the
    body.
    """
    # Load the plugins first; RotaryEmbedding is imported only afterwards.
    load_general_plugins()
    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

    layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)

    # The instantiated class must be the plugin-provided replacement.
    actual_name = layer.__class__.__name__
    assert actual_name == "DummyRotaryEmbedding", (
        f"Expected DummyRotaryEmbedding, got {actual_name}, "
        "possibly because the custom op is not registered correctly.")

    # The replacement is expected to expose an extra attribute.
    has_addition_config = hasattr(layer, "addition_config")
    assert has_addition_config, (
        "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
        "which is set by the custom op.")
# TODO: re-enable test_oot_custom_op; it is commented out below and the reason is not recorded.
# def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
# # simulate workload by running an example
# load_general_plugins()
# from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
# layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)
# assert layer.__class__.__name__ == "DummyRotaryEmbedding", (
# f"Expected DummyRotaryEmbedding, got {layer.__class__.__name__}, "
# "possibly because the custom op is not registered correctly.")
# assert hasattr(layer, "addition_config"), (
# "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
# "which is set by the custom op.")
......@@ -52,7 +52,7 @@ UNSTABLE_PROMPT_SEQUENCE = [
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("block_size", [16 if not current_platform.is_rocm() else 64])
def test_mixed_requests(
hf_runner,
vllm_runner,
......@@ -138,7 +138,7 @@ def test_unstable_prompt_sequence(
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct",
os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"),
enable_chunked_prefill=True,
enable_prefix_caching=True,
max_model_len=4096,
......@@ -150,7 +150,7 @@ def test_unstable_prompt_sequence(
@pytest.mark.parametrize("model", MODELS)
def test_fully_cached_prefill_needs_uncached_token(model):
block_size = 16
block_size = 16 if not current_platform.is_rocm() else 64
max_num_batched_tokens = 16
num_output_tokens = 5
# Make a vllm engine
......
......@@ -659,31 +659,31 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
assert output
@pytest.mark.parametrize(
    "args",
    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
      CompressedTensorsW4A16Fp4),
     ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
def test_compressed_tensors_nvfp4(vllm_runner, args):
    """Check the FP4 scheme chosen for an NVFP4 checkpoint, then generate.

    Args:
        vllm_runner: Fixture used as a context manager to build the engine
            wrapper for the given model name.
        args: ``(model_name, expected_scheme_class)`` pair supplied by the
            parametrization above.

    Raises:
        AssertionError: If the first layer's ``qkv_proj`` does not use
            ``CompressedTensorsLinearMethod``, if its scheme is neither the
            expected one nor the accepted W4A16 alternative, if the scheme's
            ``group_size`` is not 16, or if generation returns a falsy value.
    """
    model_name, scheme = args
    with vllm_runner(model_name, enforce_eager=True) as llm:

        def check_model(model):
            """Inspect the first decoder layer's fused QKV projection."""
            layer = model.model.layers[0]
            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method,
                              CompressedTensorsLinearMethod)

            # Accept the expected scheme, or the W4A16 scheme when
            # cutlass_fp4_supported() is false. The parentheses spell out
            # the grouping the original relied on via operator precedence
            # (`and` binds tighter than `or`).
            scheme_ok = isinstance(qkv_proj.scheme, scheme) or (
                isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
                and not cutlass_fp4_supported())
            if not scheme_ok:
                raise AssertionError("FP4 Scheme Mismatch")

            assert qkv_proj.scheme.group_size == 16

        llm.apply_model(check_model)

        # Smoke-test generation on top of the structural checks.
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output
# @pytest.mark.parametrize(
# "args",
# [("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
# CompressedTensorsW4A16Fp4),
# ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
# def test_compressed_tensors_nvfp4(vllm_runner, args):
# model, scheme = args
# with vllm_runner(model, enforce_eager=True) as llm:
# def check_model(model):
# layer = model.model.layers[0]
# qkv_proj = layer.self_attn.qkv_proj
# assert isinstance(qkv_proj.quant_method,
# CompressedTensorsLinearMethod)
# if isinstance(qkv_proj.scheme, scheme) or isinstance(
# qkv_proj.scheme,
# CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
# assert True
# else:
# raise AssertionError("FP4 Scheme Mismatch")
# assert qkv_proj.scheme.group_size == 16
# llm.apply_model(check_model)
# output = llm.generate_greedy("Hello my name is", max_tokens=20)
# print(output)
# assert output
......@@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization import (
QuantizationMethods, get_quantization_config, register_quantization_config)
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig)
from vllm.platforms import current_platform
from ..utils import models_path_prefix
......@@ -101,24 +102,26 @@ def test_register_quantization_config():
register_quantization_config("custom_quant")(CustomQuantConfig)
@pytest.mark.parametrize(
    argnames="model",
    argvalues=[
        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    ])
def test_custom_quant(vllm_runner, model, monkeypatch):
    """Run inference with the registered ``custom_quant`` quantization.

    The test builds the engine with ``quantization="custom_quant"``, checks
    that the first layer's ``qkv_proj`` uses ``FakeQuantLinearMethod``, and
    then asserts that greedy generation returns a truthy result.
    """
    # vllm_runner.apply_model() relies on V0 internals.
    monkeypatch.setenv("VLLM_USE_V1", "0")

    runner_kwargs = {
        "model_name": model,
        "quantization": "custom_quant",
        "enforce_eager": True,
    }
    with vllm_runner(**runner_kwargs) as llm:
        # Walk from the runner down to the underlying model object.
        driver_worker = llm.model.llm_engine.model_executor.driver_worker
        loaded_model = driver_worker.model_runner.model  # noqa: E501

        first_layer = loaded_model.model.layers[0]
        quant_method = first_layer.self_attn.qkv_proj.quant_method

        # The quantization method must be FakeQuantLinearMethod.
        assert isinstance(quant_method, FakeQuantLinearMethod)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
\ No newline at end of file
# TODO: re-enable test_custom_quant; it is commented out below and the reason is not recorded.
# @pytest.mark.parametrize(argnames="model",
# argvalues=[
# os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
# ])
# def test_custom_quant(vllm_runner, model, monkeypatch):
# """Test infer with the custom quantization method."""
# # vllm_runner.apply_model() relies on V0 internals.
# monkeypatch.setenv("VLLM_USE_V1", "0")
# with vllm_runner(model_name=model,
# quantization="custom_quant",
# enforce_eager=True,
# block_size=16 if not current_platform.is_rocm() else 64) as llm:
# model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
# layer = model.model.layers[0]
# qkv_proj = layer.self_attn.qkv_proj
# # Check the quantization method is FakeQuantLinearMethod
# assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
# output = llm.generate_greedy("Hello my name is", max_tokens=20)
# assert output
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment