Commit 87a2e37f authored by zhuwenwen
Browse files

update tests

parent 3c9817d2
...@@ -19,8 +19,8 @@ from ..utils import models_path_prefix ...@@ -19,8 +19,8 @@ from ..utils import models_path_prefix
import os import os
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "facebook/opt-125m"),
] ]
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
......
...@@ -22,10 +22,10 @@ TEST_MODELS = [ ...@@ -22,10 +22,10 @@ TEST_MODELS = [
"dtype": torch.float16, "dtype": torch.float16,
"quantization": "compressed-tensors" "quantization": "compressed-tensors"
}), }),
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), { # (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
"dtype": torch.float16, # "dtype": torch.float16,
"quantization": "fp8" # "quantization": "fp8"
}), # }),
(os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), { (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
"quantization": "compressed-tensors" "quantization": "compressed-tensors"
}), }),
...@@ -49,20 +49,20 @@ if is_quant_method_supported("gptq"): ...@@ -49,20 +49,20 @@ if is_quant_method_supported("gptq"):
"quantization": "gptq" "quantization": "gptq"
})) }))
if is_quant_method_supported("gptq_marlin"): # if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), { # TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
"quantization": "gptq_marlin" # "quantization": "gptq_marlin"
})) # }))
if is_quant_method_supported("gptq_marlin_24"): # if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), { # TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
"quantization": "gptq_marlin_24" # "quantization": "gptq_marlin_24"
})) # }))
if is_quant_method_supported("marlin"): # if is_quant_method_supported("marlin"):
TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), { # TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
"quantization": "marlin" # "quantization": "marlin"
})) # }))
if not is_hip() and is_quant_method_supported("awq"): if not is_hip() and is_quant_method_supported("awq"):
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), { TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
......
...@@ -24,6 +24,7 @@ import os ...@@ -24,6 +24,7 @@ import os
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix from ...utils import models_path_prefix
import vllm.envs as envs
# main model # main model
MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m") MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
...@@ -36,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-ra ...@@ -36,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-ra
MAX_SPEC_TOKENS = 4 MAX_SPEC_TOKENS = 4
# precision # precision
PRECISION = "float32" PRECISION = "float32" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half"
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -34,7 +34,7 @@ async def test_tokenizer_group(tokenizer_group_type): ...@@ -34,7 +34,7 @@ async def test_tokenizer_group(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2")) reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group = get_tokenizer_group( tokenizer_group = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type), get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
...@@ -58,7 +58,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type): ...@@ -58,7 +58,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2")) reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
tokenizer_group_pool = get_tokenizer_group( tokenizer_group_pool = get_tokenizer_group(
get_tokenizer_pool_config(tokenizer_group_type), get_tokenizer_pool_config(tokenizer_group_type),
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
...@@ -100,7 +100,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation( ...@@ -100,7 +100,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
tokenizer_pool_config, tokenizer_pool_config,
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None) max_input_length=None)
...@@ -111,7 +111,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation( ...@@ -111,7 +111,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
tokenizer_pool_config, tokenizer_pool_config,
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None) max_input_length=None)
...@@ -148,7 +148,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -148,7 +148,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
tokenizer_pool_config, tokenizer_pool_config,
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
...@@ -175,7 +175,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -175,7 +175,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
fail_at = [1] fail_at = [1]
tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
tokenizer_pool_config, tokenizer_pool_config,
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=None, max_input_length=None,
...@@ -196,7 +196,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): ...@@ -196,7 +196,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
fail_at = [] fail_at = []
tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
tokenizer_pool_config, tokenizer_pool_config,
tokenizer_id="gpt2", tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
enable_lora=False, enable_lora=False,
max_num_seqs=1, max_num_seqs=1,
max_input_length=2, max_input_length=2,
......
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from huggingface_hub import snapshot_download # from huggingface_hub import snapshot_download
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
...@@ -12,7 +12,7 @@ from .utils import ARGS, CONFIGS, ServerConfig ...@@ -12,7 +12,7 @@ from .utils import ARGS, CONFIGS, ServerConfig
def server_config(request): def server_config(request):
config = CONFIGS[request.param] config = CONFIGS[request.param]
# download model and tokenizer using transformers # download model and tokenizer using transformers
snapshot_download(config["model"]) # snapshot_download(config["model"])
yield CONFIGS[request.param] yield CONFIGS[request.param]
......
from typing import Dict, List from typing import Dict, List
import os
from openai.types.chat import (ChatCompletionMessageParam, from openai.types.chat import (ChatCompletionMessageParam,
ChatCompletionToolParam) ChatCompletionToolParam)
from typing_extensions import TypedDict from typing_extensions import TypedDict
from tests.utils import VLLM_PATH from tests.utils import VLLM_PATH
from ..utils import models_path_prefix
class ServerConfig(TypedDict): class ServerConfig(TypedDict):
...@@ -19,7 +21,7 @@ ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"] ...@@ -19,7 +21,7 @@ ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"]
CONFIGS: Dict[str, ServerConfig] = { CONFIGS: Dict[str, ServerConfig] = {
"hermes": { "hermes": {
"model": "model":
"NousResearch/Hermes-3-Llama-3.1-8B", os.path.join(models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B"),
"arguments": [ "arguments": [
"--tool-call-parser", "hermes", "--chat-template", "--tool-call-parser", "hermes", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
...@@ -27,7 +29,7 @@ CONFIGS: Dict[str, ServerConfig] = { ...@@ -27,7 +29,7 @@ CONFIGS: Dict[str, ServerConfig] = {
}, },
"mistral": { "mistral": {
"model": "model":
"mistralai/Mistral-7B-Instruct-v0.3", os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
"arguments": [ "arguments": [
"--tool-call-parser", "mistral", "--chat-template", "--tool-call-parser", "mistral", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
......
...@@ -582,7 +582,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA): ...@@ -582,7 +582,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
qkv_words = "|".join(lay_qkv_words) qkv_words = "|".join(lay_qkv_words)
for layername, weight in params_dict.items(): for layername, weight in params_dict.items():
if "lm_head.weight" in layername: if "lm_head.weight" in layername and weight.shape[1] >= 4096:
lay_key_words.append("lm_head.weight") lay_key_words.append("lm_head.weight")
combined_words = "|".join(lay_key_words) combined_words = "|".join(lay_key_words)
os.environ['LM_NN'] = '1' os.environ['LM_NN'] = '1'
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment