Commit 5e078c69 authored by zhuwenwen
Browse files

[tests]skip tpu and weight_loading tests, fix tests of worker

parent ced28510
......@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{
# Speculative max model len > overridden max model len should raise.
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 129,
},
......@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 2048 + 1,
},
......@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": {
"model": "JackFram/llama-68m",
"model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5,
"max_model_len": 131072 + 1,
},
......
......@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "meta-llama/Llama-2-7b-chat-hf",
"model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16",
# Main model
"model_name": "Qwen/Qwen2-7B-Instruct",
"model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
},
......
......@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctness for the target model outputs.
"""
import os
import pytest
from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model
MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
......
......@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test.
"enforce_eager": True,
......
......@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert (num_mismatch > 0)
@torch.inference_mode()
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths.
@pytest.mark.parametrize('attn_backend',
[_Backend.XFORMERS, _Backend.FLASH_ATTN])
def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token.

A MultiStepWorker is asked for ``num_steps`` samples in a single
``sampler_output`` call, while a plain Worker is driven through
``execute_model`` once for the first pre-generated token and then once
per step. The test passes when, for every layer, both cache tensors of
the two workers agree within rtol=atol=1e-2.

Args:
    num_steps: value passed as ``sample_len`` to the multi-step worker
        and the number of single-step iterations run on the baseline.
    attn_backend: attention backend forced for the duration of the test.
"""
seed = 100
model_name = "JackFram/llama-68m"
block_size = 16
num_gpu_blocks = 2048 // block_size
batch_size = 1
with global_force_attn_backend_context_manager(attn_backend):
# float16 is selected for FLASH_ATTN and float32 for every other backend.
# NOTE(review): presumably FLASH_ATTN only supports half precision — confirm.
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
# Worker under test: built with TP1DraftModelRunner as its model runner.
multi_step_worker = create_worker(MultiStepWorker,
model_name,
block_size,
num_gpu_blocks,
seed,
model_runner_cls=TP1DraftModelRunner,
dtype=dtype)
multi_step_worker.set_include_gpu_probs_tensor()
# Baseline worker: same model, block layout, seed and dtype.
worker = create_worker(Worker,
model_name,
block_size,
num_gpu_blocks,
seed,
dtype=dtype)
# Each prompt is the single token id 0.
prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence
# so that we can simulate the bonus token case
# NOTE(review): `random` is not seeded inside this test, so the two
# token ids can differ from run to run.
multi_step_continuations = [[
random.randint(0, 1000),
random.randint(0, 1000)
] for _ in prompts]
# Final length = prompt tokens + the two pre-generated tokens + num_steps.
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
# Every sequence id in the batch is marked as carrying a bonus token.
seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens)
# Run multi-step.
zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list),
sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly.
zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first
# Only the first of the two pre-generated tokens is fed in this pass.
single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=single_step_continuations,
final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list))
for _ in range(num_steps):
# Metadata is rebuilt each iteration from the continuations, which
# grow by one sampled token per sequence at the end of the iteration.
seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts,
num_gpu_blocks,
block_size,
continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list))
# Append the first sample's token of each sequence group taken from the
# last element of the returned output.
for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations[i].append(
seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and
# multi-step workers are the same.
single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers = len(single_step_gpu_cache)
# Both operands are moved to CUDA before the tolerance comparison.
allclose = lambda a, b: torch.allclose(
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
# Entries [0] and [1] of each layer are compared separately.
# NOTE(review): presumably these are the key and value caches — confirm.
for i in range(num_layers):
assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache[i][0])
assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache[i][1])
# @torch.inference_mode()
# @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# # The choice of backends forces the multi_step_worker to choose between
# # the vanilla model_runner and TP1DraftModelRunner and that we can test
# # both code paths.
# @pytest.mark.parametrize('attn_backend',
# [_Backend.XFORMERS, _Backend.FLASH_ATTN])
# def test_multi_step_correct_kvcache(num_steps, attn_backend):
# """Verify that the KV cache of the draft model
# is correctly updated for sequences with bonus token.
# """
# seed = 100
# model_name = "JackFram/llama-68m"
# block_size = 16
# num_gpu_blocks = 2048 // block_size
# batch_size = 1
# with global_force_attn_backend_context_manager(attn_backend):
# dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
# multi_step_worker = create_worker(MultiStepWorker,
# model_name,
# block_size,
# num_gpu_blocks,
# seed,
# model_runner_cls=TP1DraftModelRunner,
# dtype=dtype)
# multi_step_worker.set_include_gpu_probs_tensor()
# worker = create_worker(Worker,
# model_name,
# block_size,
# num_gpu_blocks,
# seed,
# dtype=dtype)
# prompts = [[0] for _ in range(batch_size)]
# # Already generate two tokens for the sequence
# # so that we can simulate the bonus token case
# multi_step_continuations = [[
# random.randint(0, 1000),
# random.randint(0, 1000)
# ] for _ in prompts]
# final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
# seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=multi_step_continuations,
# final_prompt_lens=final_prompt_lens)
# # Run multi-step.
# zero_kv_cache(multi_step_worker.cache_engine)
# multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list),
# sample_len=num_steps,
# seq_ids_with_bonus_token_in_last_step=
# seq_ids_with_bonus_token_in_last_step)
# # Run single-step repeatedly.
# zero_kv_cache(worker.cache_engine)
# # Generate the kv cache for the bonus token first
# single_step_continuations = [c[:1] for c in multi_step_continuations]
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=single_step_continuations,
# final_prompt_lens=final_prompt_lens)
# single_step_output = worker.execute_model(
# execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list))
# for _ in range(num_steps):
# seq_group_metadata_list = create_seq_group_metadata_from_prompts(
# prompts,
# num_gpu_blocks,
# block_size,
# continuations=multi_step_continuations,
# final_prompt_lens=final_prompt_lens)
# single_step_output = worker.execute_model(
# execute_model_req=ExecuteModelRequest(
# seq_group_metadata_list=seq_group_metadata_list))
# for i, seq_group_output in enumerate(single_step_output[-1]):
# multi_step_continuations[i].append(
# seq_group_output.samples[0].output_token)
# # Verify that the KV cache of the single-step and
# # multi-step workers are the same.
# single_step_gpu_cache = worker.cache_engine[0].gpu_cache
# multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
# num_layers = len(single_step_gpu_cache)
# allclose = lambda a, b: torch.allclose(
# a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
# for i in range(num_layers):
# assert allclose(single_step_gpu_cache[i][0],
# multi_step_gpu_cache[i][0])
# assert allclose(single_step_gpu_cache[i][1],
# multi_step_gpu_cache[i][1])
@torch.inference_mode()
......
......@@ -5,6 +5,7 @@ from collections import defaultdict
from types import SimpleNamespace
from unittest.mock import MagicMock
import os
import pytest
import torch
......@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from .test_utils import mock_spec_decode_sampler
from .utils import (create_batch, create_sampler_output_list, create_worker,
mock_worker)
from ..utils import models_path_prefix
@pytest.mark.parametrize('k', [1, 2, 6])
......@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks = 8096 // block_size
target_worker = create_worker(
Worker,
"JackFram/llama-68m",
os.path.join(models_path_prefix, "JackFram/llama-68m"),
block_size,
num_gpu_blocks,
seed,
)
draft_worker = create_worker(
MultiStepWorker,
"abhigoyal/vllm-eagle-llama-68m-random",
os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
block_size,
num_gpu_blocks,
seed,
......
......@@ -7,6 +7,7 @@ import pathlib
import subprocess
from functools import partial
from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai
import pytest
......@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
......@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py",
)
model_ref = "meta-llama/Llama-2-7b-hf"
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path)
......
......@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama": {
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
......@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama3.2": {
"model":
"meta-llama/Llama-3.2-3B-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
......@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
......@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"llama4_json": {
"model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
"--distributed-executor-backend", "mp", "--tool-call-parser",
......@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# },
"granite-3.0-8b": {
"model":
"ibm-granite/granite-3.0-8b-instruct",
os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template",
......@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"granite-3.1-8b": {
"model":
"ibm-granite/granite-3.1-8b-instruct",
os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
"arguments": [
"--enforce-eager",
"--no-enable-prefix-caching",
......@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"internlm": {
"model":
"internlm/internlm2_5-7b-chat",
os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template",
......@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
},
"toolACE": {
"model":
"Team-ACE/ToolACE-8B",
os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
"arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
......
......@@ -4,6 +4,8 @@ from dataclasses import dataclass
import lm_eval
import pytest
import os
from ..utils import models_path_prefix
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
......@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
......
......@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
def test_deepseek_mla_attn_backend_module():
model_runner = _create_model_runner(
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
trust_remote_code=True,
enable_chunked_prefill=False,
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment