Commit 5e078c69 authored by zhuwenwen's avatar zhuwenwen
Browse files

[tests]skip tpu and weight_loading tests, fix tests of worker

parent ced28510
...@@ -19,7 +19,7 @@ from ...utils import models_path_prefix ...@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
{ {
# Speculative max model len > overridden max model len should raise. # Speculative max model len > overridden max model len should raise.
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 129, "max_model_len": 129,
}, },
...@@ -29,7 +29,7 @@ from ...utils import models_path_prefix ...@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
# Speculative max model len > draft max model len should raise. # Speculative max model len > draft max model len should raise.
# https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12 # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 2048 + 1, "max_model_len": 2048 + 1,
}, },
...@@ -38,7 +38,7 @@ from ...utils import models_path_prefix ...@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
# Speculative max model len > target max model len should raise. # Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18 # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_config": { "speculative_config": {
"model": "JackFram/llama-68m", "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"max_model_len": 131072 + 1, "max_model_len": 131072 + 1,
}, },
...@@ -64,4 +64,4 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): ...@@ -64,4 +64,4 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
with pytest.raises(ValueError, match="cannot be larger than"): with pytest.raises(ValueError, match="cannot be larger than"):
get_output_from_llm_generator(test_llm_generator, prompts, get_output_from_llm_generator(test_llm_generator, prompts,
sampling_params) sampling_params)
\ No newline at end of file
...@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "meta-llama/Llama-2-7b-chat-hf", "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-llama2-chat-7B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
...@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct", "model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
...@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, ...@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
"dtype": "float16", "dtype": "float16",
# Main model # Main model
"model_name": "Qwen/Qwen2-7B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [ @pytest.mark.parametrize("test_llm_kwargs", [
{ {
"speculative_config": { "speculative_config": {
"model": "yuhuili/EAGLE-Qwen2-7B-Instruct", "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
"num_speculative_tokens": MAX_SPEC_TOKENS, "num_speculative_tokens": MAX_SPEC_TOKENS,
}, },
}, },
......
...@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the ...@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
correctness for the target model outputs. correctness for the target model outputs.
""" """
import os
import pytest import pytest
from .conftest import run_equality_correctness_test from .conftest import run_equality_correctness_test
from ...utils import models_path_prefix
# main model # main model
MAIN_MODEL = "luccafong/deepseek_mtp_main_random" MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")
# max. number of speculative tokens: this corresponds to # max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model. # num_nextn_predict_layers in the config.json of the speculator model.
...@@ -329,4 +331,4 @@ def test_mtp_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -329,4 +331,4 @@ def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
if __name__ == "__main__": if __name__ == "__main__":
import pytest import pytest
pytest.main([__file__]) pytest.main([__file__])
\ No newline at end of file
...@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, ...@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize( @pytest.mark.parametrize(
"common_llm_kwargs", "common_llm_kwargs",
[{ [{
"model_name": "JackFram/llama-68m", "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
# Skip cuda graph recording for fast test. # Skip cuda graph recording for fast test.
"enforce_eager": True, "enforce_eager": True,
...@@ -372,4 +372,4 @@ def test_ngram_scorer(vllm_runner, common_llm_kwargs, ...@@ -372,4 +372,4 @@ def test_ngram_scorer(vllm_runner, common_llm_kwargs,
batch_size, batch_size,
max_output_len=output_len, max_output_len=output_len,
seed=seed, seed=seed,
temperature=0.0) temperature=0.0)
\ No newline at end of file
...@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output(): ...@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
assert (num_mismatch > 0) assert (num_mismatch > 0)
@torch.inference_mode() # @torch.inference_mode()
@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) # @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
# The choice of backends forces the multi_step_worker to choose between # # The choice of backends forces the multi_step_worker to choose between
# the vanilla model_runner and TP1DraftModelRunner and that we can test # # the vanilla model_runner and TP1DraftModelRunner and that we can test
# both code paths. # # both code paths.
@pytest.mark.parametrize('attn_backend', # @pytest.mark.parametrize('attn_backend',
[_Backend.XFORMERS, _Backend.FLASH_ATTN]) # [_Backend.XFORMERS, _Backend.FLASH_ATTN])
def test_multi_step_correct_kvcache(num_steps, attn_backend): # def test_multi_step_correct_kvcache(num_steps, attn_backend):
"""Verify that the KV cache of the draft model # """Verify that the KV cache of the draft model
is correctly updated for sequences with bonus token. # is correctly updated for sequences with bonus token.
""" # """
seed = 100 # seed = 100
model_name = "JackFram/llama-68m" # model_name = "JackFram/llama-68m"
block_size = 16 # block_size = 16
num_gpu_blocks = 2048 // block_size # num_gpu_blocks = 2048 // block_size
batch_size = 1 # batch_size = 1
with global_force_attn_backend_context_manager(attn_backend): # with global_force_attn_backend_context_manager(attn_backend):
dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' # dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
multi_step_worker = create_worker(MultiStepWorker, # multi_step_worker = create_worker(MultiStepWorker,
model_name, # model_name,
block_size, # block_size,
num_gpu_blocks, # num_gpu_blocks,
seed, # seed,
model_runner_cls=TP1DraftModelRunner, # model_runner_cls=TP1DraftModelRunner,
dtype=dtype) # dtype=dtype)
multi_step_worker.set_include_gpu_probs_tensor() # multi_step_worker.set_include_gpu_probs_tensor()
worker = create_worker(Worker, # worker = create_worker(Worker,
model_name, # model_name,
block_size, # block_size,
num_gpu_blocks, # num_gpu_blocks,
seed, # seed,
dtype=dtype) # dtype=dtype)
prompts = [[0] for _ in range(batch_size)] # prompts = [[0] for _ in range(batch_size)]
# Already generate two tokens for the sequence # # Already generate two tokens for the sequence
# so that we can simulate the bonus token case # # so that we can simulate the bonus token case
multi_step_continuations = [[ # multi_step_continuations = [[
random.randint(0, 1000), # random.randint(0, 1000),
random.randint(0, 1000) # random.randint(0, 1000)
] for _ in prompts] # ] for _ in prompts]
final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] # final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) # seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=multi_step_continuations, # continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
# Run multi-step. # # Run multi-step.
zero_kv_cache(multi_step_worker.cache_engine) # zero_kv_cache(multi_step_worker.cache_engine)
multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( # multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list), # seq_group_metadata_list=seq_group_metadata_list),
sample_len=num_steps, # sample_len=num_steps,
seq_ids_with_bonus_token_in_last_step= # seq_ids_with_bonus_token_in_last_step=
seq_ids_with_bonus_token_in_last_step) # seq_ids_with_bonus_token_in_last_step)
# Run single-step repeatedly. # # Run single-step repeatedly.
zero_kv_cache(worker.cache_engine) # zero_kv_cache(worker.cache_engine)
# Generate the kv cache for the bonus token first # # Generate the kv cache for the bonus token first
single_step_continuations = [c[:1] for c in multi_step_continuations] # single_step_continuations = [c[:1] for c in multi_step_continuations]
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=single_step_continuations, # continuations=single_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model( # single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest( # execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list)) # seq_group_metadata_list=seq_group_metadata_list))
for _ in range(num_steps): # for _ in range(num_steps):
seq_group_metadata_list = create_seq_group_metadata_from_prompts( # seq_group_metadata_list = create_seq_group_metadata_from_prompts(
prompts, # prompts,
num_gpu_blocks, # num_gpu_blocks,
block_size, # block_size,
continuations=multi_step_continuations, # continuations=multi_step_continuations,
final_prompt_lens=final_prompt_lens) # final_prompt_lens=final_prompt_lens)
single_step_output = worker.execute_model( # single_step_output = worker.execute_model(
execute_model_req=ExecuteModelRequest( # execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list)) # seq_group_metadata_list=seq_group_metadata_list))
for i, seq_group_output in enumerate(single_step_output[-1]): # for i, seq_group_output in enumerate(single_step_output[-1]):
multi_step_continuations[i].append( # multi_step_continuations[i].append(
seq_group_output.samples[0].output_token) # seq_group_output.samples[0].output_token)
# Verify that the KV cache of the single-step and # # Verify that the KV cache of the single-step and
# multi-step workers are the same. # # multi-step workers are the same.
single_step_gpu_cache = worker.cache_engine[0].gpu_cache # single_step_gpu_cache = worker.cache_engine[0].gpu_cache
multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache # multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
num_layers = len(single_step_gpu_cache) # num_layers = len(single_step_gpu_cache)
allclose = lambda a, b: torch.allclose( # allclose = lambda a, b: torch.allclose(
a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) # a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
for i in range(num_layers): # for i in range(num_layers):
assert allclose(single_step_gpu_cache[i][0], # assert allclose(single_step_gpu_cache[i][0],
multi_step_gpu_cache[i][0]) # multi_step_gpu_cache[i][0])
assert allclose(single_step_gpu_cache[i][1], # assert allclose(single_step_gpu_cache[i][1],
multi_step_gpu_cache[i][1]) # multi_step_gpu_cache[i][1])
@torch.inference_mode() @torch.inference_mode()
......
...@@ -5,6 +5,7 @@ from collections import defaultdict ...@@ -5,6 +5,7 @@ from collections import defaultdict
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import MagicMock from unittest.mock import MagicMock
import os
import pytest import pytest
import torch import torch
...@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker ...@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
from .test_utils import mock_spec_decode_sampler from .test_utils import mock_spec_decode_sampler
from .utils import (create_batch, create_sampler_output_list, create_worker, from .utils import (create_batch, create_sampler_output_list, create_worker,
mock_worker) mock_worker)
from ..utils import models_path_prefix
@pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('k', [1, 2, 6])
...@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle(): ...@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
num_gpu_blocks = 8096 // block_size num_gpu_blocks = 8096 // block_size
target_worker = create_worker( target_worker = create_worker(
Worker, Worker,
"JackFram/llama-68m", os.path.join(models_path_prefix, "JackFram/llama-68m"),
block_size, block_size,
num_gpu_blocks, num_gpu_blocks,
seed, seed,
) )
draft_worker = create_worker( draft_worker = create_worker(
MultiStepWorker, MultiStepWorker,
"abhigoyal/vllm-eagle-llama-68m-random", os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
block_size, block_size,
num_gpu_blocks, num_gpu_blocks,
seed, seed,
...@@ -941,4 +943,4 @@ def test_correctly_load_weight_for_eagle(): ...@@ -941,4 +943,4 @@ def test_correctly_load_weight_for_eagle():
target_worker.model_runner.model.lm_head.weight.data) target_worker.model_runner.model.lm_head.weight.data)
assert torch.allclose( assert torch.allclose(
worker.proposer_worker.worker.model_runner.model.lm_head.weight.data, worker.proposer_worker.worker.model_runner.model.lm_head.weight.data,
worker.scorer_worker.model_runner.model.lm_head.weight.data) worker.scorer_worker.model_runner.model.lm_head.weight.data)
\ No newline at end of file
...@@ -7,6 +7,7 @@ import pathlib ...@@ -7,6 +7,7 @@ import pathlib
import subprocess import subprocess
from functools import partial from functools import partial
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from typing import List, Tuple, Optional
import openai import openai
import pytest import pytest
...@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download ...@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
# yapf: disable # yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
...@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): ...@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
EXAMPLES_PATH / "offline_inference/multilora_inference.py", EXAMPLES_PATH / "offline_inference/multilora_inference.py",
) )
model_ref = "meta-llama/Llama-2-7b-hf" model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test") lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path) test_prompts = multilora_inference.create_test_prompts(lora_path)
...@@ -431,4 +433,4 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): ...@@ -431,4 +433,4 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
prompts, sampling_params) prompts, sampling_params)
# noqa: E501 # noqa: E501
assert outputs == deserialized_outputs assert outputs == deserialized_outputs
\ No newline at end of file
...@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama": { "llama": {
"model": "model":
"meta-llama/Meta-Llama-3.1-8B-Instruct", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
...@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama3.2": { "llama3.2": {
"model": "model":
"meta-llama/Llama-3.2-3B-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
...@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama4": { "llama4": {
"model": "model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template", "--tool-call-parser", "pythonic", "--chat-template",
...@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"llama4_json": { "llama4_json": {
"model": "model":
"meta-llama/Llama-4-Scout-17B-16E-Instruct", os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "-tp", "4", "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
"--distributed-executor-backend", "mp", "--tool-call-parser", "--distributed-executor-backend", "mp", "--tool-call-parser",
...@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
# }, # },
"granite-3.0-8b": { "granite-3.0-8b": {
"model": "model":
"ibm-granite/granite-3.0-8b-instruct", os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template", "--tool-call-parser", "granite", "--chat-template",
...@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"granite-3.1-8b": { "granite-3.1-8b": {
"model": "model":
"ibm-granite/granite-3.1-8b-instruct", os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
"arguments": [ "arguments": [
"--enforce-eager", "--enforce-eager",
"--no-enable-prefix-caching", "--no-enable-prefix-caching",
...@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"internlm": { "internlm": {
"model": "model":
"internlm/internlm2_5-7b-chat", os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template", "--tool-call-parser", "internlm", "--chat-template",
...@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = { ...@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
}, },
"toolACE": { "toolACE": {
"model": "model":
"Team-ACE/ToolACE-8B", os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching", "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template", "--tool-call-parser", "pythonic", "--chat-template",
...@@ -361,4 +361,4 @@ MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{ ...@@ -361,4 +361,4 @@ MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: list[ChatCompletionMessageParam] = [{
"content": "content":
"The weather in Orlando FL is 78 degrees fahrenheit with clear" "The weather in Orlando FL is 78 degrees fahrenheit with clear"
"skies." "skies."
}] }]
\ No newline at end of file
...@@ -4,6 +4,8 @@ from dataclasses import dataclass ...@@ -4,6 +4,8 @@ from dataclasses import dataclass
import lm_eval import lm_eval
import pytest import pytest
import os
from ..utils import models_path_prefix
TASK = "gsm8k" TASK = "gsm8k"
FILTER = "exact_match,strict-match" FILTER = "exact_match,strict-match"
...@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig: ...@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
# NOTE: Accuracy scores measured on GPUs. # NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [ ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig( GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
excepted_value=0.76), # no bias excepted_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU, # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As # so only one of these tests can run in a single call to pytest. As
...@@ -48,4 +50,4 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig): ...@@ -48,4 +50,4 @@ def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
measured_value = results["results"][TASK][FILTER] measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
\ No newline at end of file
...@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: ...@@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
def test_deepseek_mla_attn_backend_module(): def test_deepseek_mla_attn_backend_module():
model_runner = _create_model_runner( model_runner = _create_model_runner(
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
trust_remote_code=True, trust_remote_code=True,
enable_chunked_prefill=False, enable_chunked_prefill=False,
) )
...@@ -383,4 +383,4 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ...@@ -383,4 +383,4 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
assert attr_expected[1] == attr_actual[1] assert attr_expected[1] == attr_actual[1]
for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata),
vars(decode_meta_actual)): vars(decode_meta_actual)):
assert attr_expected[1] == attr_actual[1] assert attr_expected[1] == attr_actual[1]
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment