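Update tests: load models from the local models_path_prefix, disable the FP8/Marlin compile cases and the Hugging Face snapshot download, make EAGLE test precision depend on VLLM_USE_TRITON_FLASH_ATTN, and restrict the Llama lm_head handling to weights with shape[1] >= 4096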

87a2e37f · zhuwenwen · 3c9817d2
Commit 87a2e37f authored Nov 27, 2024 by zhuwenwen
7 changed files
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -19,8 +19,8 @@ from ..utils import models_path_prefix
 import os
 MODELS = [
-        os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
-    ]
+]
 @pytest.fixture(scope="module", autouse=True)

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -22,10 +22,10 @@ TEST_MODELS = [
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
-    (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
+    # (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
-        "dtype": torch.float16,
+    #     "dtype": torch.float16,
-        "quantization": "fp8"
+    #     "quantization": "fp8"
-    }),
+    # }),
    (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
        "quantization": "compressed-tensors"
    }),
@@ -49,20 +49,20 @@ if is_quant_method_supported("gptq"):
        "quantization": "gptq"
    }))
-if is_quant_method_supported("gptq_marlin"):
+# if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
+#     TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
-        "quantization": "gptq_marlin"
+#         "quantization": "gptq_marlin"
-    }))
+#     }))
-if is_quant_method_supported("gptq_marlin_24"):
+# if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
+#     TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
-        "quantization": "gptq_marlin_24"
+#         "quantization": "gptq_marlin_24"
-    }))
+#     }))
-if is_quant_method_supported("marlin"):
+# if is_quant_method_supported("marlin"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
+#     TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
-        "quantization": "marlin"
+#         "quantization": "marlin"
-    }))
+#     }))
 if not is_hip() and is_quant_method_supported("awq"):
    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {

--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -24,6 +24,7 @@ import os
 from .conftest import run_equality_correctness_test
 from ...utils import models_path_prefix
+import vllm.envs as envs
 # main model
 MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
@@ -36,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-ra
 MAX_SPEC_TOKENS = 4
 # precision
-PRECISION = "float32"
+PRECISION = "float32" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half"
 @pytest.mark.parametrize(

--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
@@ -34,7 +34,7 @@ async def test_tokenizer_group(tokenizer_group_type):
    reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
    tokenizer_group = get_tokenizer_group(
        get_tokenizer_pool_config(tokenizer_group_type),
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,
@@ -58,7 +58,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
    reference_tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "gpt2"))
    tokenizer_group_pool = get_tokenizer_group(
        get_tokenizer_pool_config(tokenizer_group_type),
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,
@@ -100,7 +100,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
    tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
        tokenizer_pool_config,
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None)
@@ -111,7 +111,7 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
        tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
        tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config(
            tokenizer_pool_config,
-            tokenizer_id="gpt2",
+            tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
            enable_lora=False,
            max_num_seqs=1,
            max_input_length=None)
@@ -148,7 +148,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
        tokenizer_pool_config,
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,
@@ -175,7 +175,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
    fail_at = [1]
    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
        tokenizer_pool_config,
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=None,
@@ -196,7 +196,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
    fail_at = []
    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
        tokenizer_pool_config,
-        tokenizer_id="gpt2",
+        tokenizer_id=os.path.join(models_path_prefix, "gpt2"),
        enable_lora=False,
        max_num_seqs=1,
        max_input_length=2,

--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
 import pytest
 import pytest_asyncio
-from huggingface_hub import snapshot_download
+# from huggingface_hub import snapshot_download
 from tests.utils import RemoteOpenAIServer
@@ -12,7 +12,7 @@ from .utils import ARGS, CONFIGS, ServerConfig
 def server_config(request):
    config = CONFIGS[request.param]
    # download model and tokenizer using transformers
-    snapshot_download(config["model"])
+    # snapshot_download(config["model"])
    yield CONFIGS[request.param]

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
 from typing import Dict, List
+import os
 from openai.types.chat import (ChatCompletionMessageParam,
                               ChatCompletionToolParam)
 from typing_extensions import TypedDict
 from tests.utils import VLLM_PATH
+from ..utils import models_path_prefix
 class ServerConfig(TypedDict):
@@ -19,7 +21,7 @@ ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"]
 CONFIGS: Dict[str, ServerConfig] = {
    "hermes": {
        "model":
-        "NousResearch/Hermes-3-Llama-3.1-8B",
+        os.path.join(models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B"),
        "arguments": [
            "--tool-call-parser", "hermes", "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
@@ -27,7 +29,7 @@ CONFIGS: Dict[str, ServerConfig] = {
    },
    "mistral": {
        "model":
-        "mistralai/Mistral-7B-Instruct-v0.3",
+        os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
        "arguments": [
            "--tool-call-parser", "mistral", "--chat-template",
            str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -582,7 +582,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
            qkv_words = "|".join(lay_qkv_words)          
            for layername, weight in params_dict.items():
-                if "lm_head.weight" in layername:
+                if "lm_head.weight" in layername and weight.shape[1] >= 4096:
                    lay_key_words.append("lm_head.weight")
                    combined_words = "|".join(lay_key_words)
                    os.environ['LM_NN'] = '1'