Commit a7668e46 authored by zhuwenwen
Browse files

[fix] Fix tests for v1 and worker

parent 4a62a3eb
...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if sha is None: if sha is None:
sha = get_sha(vllm_root) sha = get_sha(vllm_root)
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1.' + sha[:7] version = 'das.opt1.rc1.' + sha[:7]
else: else:
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1' version = 'das.opt1.rc1'
# dtk version # dtk version
......
...@@ -233,122 +233,122 @@ def test_fused_moe( ...@@ -233,122 +233,122 @@ def test_fused_moe(
use_cudagraph=use_cudagraph) use_cudagraph=use_cudagraph)
@pytest.mark.parametrize("m", [1, 32, 222]) # @pytest.mark.parametrize("m", [1, 32, 222])
@pytest.mark.parametrize("n", [128, 1024, 2048]) # @pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 1024]) # @pytest.mark.parametrize("k", [128, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS) # @pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS) # @pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE) # @pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) # @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("group_size", [64, 128]) # @pytest.mark.parametrize("group_size", [64, 128])
@pytest.mark.parametrize("has_zp", [True, False]) # @pytest.mark.parametrize("has_zp", [True, False])
@pytest.mark.parametrize("weight_bits", [4, 8]) # @pytest.mark.parametrize("weight_bits", [4, 8])
def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, # def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
ep_size: int, dtype: torch.dtype, group_size: int, # ep_size: int, dtype: torch.dtype, group_size: int,
has_zp: bool, weight_bits: int): # has_zp: bool, weight_bits: int):
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 # a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 # w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 # w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
score = torch.randn((m, e), device="cuda", dtype=dtype) # score = torch.randn((m, e), device="cuda", dtype=dtype)
if weight_bits == 4: # if weight_bits == 4:
pack_factor = 2 # pack_factor = 2
quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8 # quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8
elif weight_bits == 8: # elif weight_bits == 8:
pack_factor = 1 # pack_factor = 1
quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128 # quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128
w1_ref = w1.clone() # w1_ref = w1.clone()
w2_ref = w2.clone() # w2_ref = w2.clone()
w1_qweight = torch.empty((e, 2 * n, k // pack_factor), # w1_qweight = torch.empty((e, 2 * n, k // pack_factor),
device="cuda", # device="cuda",
dtype=torch.uint8) # dtype=torch.uint8)
w2_qweight = torch.empty((e, k, n // pack_factor), # w2_qweight = torch.empty((e, k, n // pack_factor),
device="cuda", # device="cuda",
dtype=torch.uint8) # dtype=torch.uint8)
w1_scales = torch.empty((e, 2 * n, k // group_size), # w1_scales = torch.empty((e, 2 * n, k // group_size),
device="cuda", # device="cuda",
dtype=dtype) # dtype=dtype)
w2_scales = torch.empty((e, k, n // group_size), # w2_scales = torch.empty((e, k, n // group_size),
device="cuda", # device="cuda",
dtype=dtype) # dtype=dtype)
w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size), # w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size),
device="cuda", # device="cuda",
dtype=torch.uint8) # dtype=torch.uint8)
w2_qzeros = torch.empty((e, k // pack_factor, n // group_size), # w2_qzeros = torch.empty((e, k // pack_factor, n // group_size),
device="cuda", # device="cuda",
dtype=torch.uint8) # dtype=torch.uint8)
for i in range(e * 2): # for i in range(e * 2):
expert_id = i % e # expert_id = i % e
if i // e == 0: # if i // e == 0:
w, w_ref, w_qweight, w_scales, w_qzeros = \ # w, w_ref, w_qweight, w_scales, w_qzeros = \
w1, w1_ref, w1_qweight, w1_scales, w1_qzeros # w1, w1_ref, w1_qweight, w1_scales, w1_qzeros
else: # else:
w, w_ref, w_qweight, w_scales, w_qzeros = \ # w, w_ref, w_qweight, w_scales, w_qzeros = \
w2, w2_ref, w2_qweight, w2_scales, w2_qzeros # w2, w2_ref, w2_qweight, w2_scales, w2_qzeros
weight, qweight, scales, qzeros = quantize_weights( # weight, qweight, scales, qzeros = quantize_weights(
w[expert_id].T, quant_type, group_size, has_zp, False) # w[expert_id].T, quant_type, group_size, has_zp, False)
weight = weight.T # weight = weight.T
qweight = qweight.T.contiguous().to(torch.uint8) # qweight = qweight.T.contiguous().to(torch.uint8)
scales = scales.T # scales = scales.T
if has_zp: # if has_zp:
qzeros = qzeros.T.contiguous().to(torch.uint8) # qzeros = qzeros.T.contiguous().to(torch.uint8)
if weight_bits == 4: # if weight_bits == 4:
qweight = qweight[:, 1::2] * 16 + qweight[:, ::2] # qweight = qweight[:, 1::2] * 16 + qweight[:, ::2]
if has_zp: # if has_zp:
qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :] # qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :]
w_ref[expert_id] = weight # w_ref[expert_id] = weight
w_qweight[expert_id] = qweight # w_qweight[expert_id] = qweight
w_scales[expert_id] = scales # w_scales[expert_id] = scales
if has_zp: # if has_zp:
w_qzeros[expert_id] = qzeros # w_qzeros[expert_id] = qzeros
if ep_size > 1: # if ep_size > 1:
local_e = e // ep_size # local_e = e // ep_size
e_ids = torch.randint(0, # e_ids = torch.randint(0,
e, (local_e, ), # e, (local_e, ),
device="cuda", # device="cuda",
dtype=torch.int32) # dtype=torch.int32)
e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32) # e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32)
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) # e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
w1_ref = w1_ref[e_ids] # w1_ref = w1_ref[e_ids]
w2_ref = w2_ref[e_ids] # w2_ref = w2_ref[e_ids]
w1_qweight = w1_qweight[e_ids] # w1_qweight = w1_qweight[e_ids]
w2_qweight = w2_qweight[e_ids] # w2_qweight = w2_qweight[e_ids]
w1_scales = w1_scales[e_ids] # w1_scales = w1_scales[e_ids]
w2_scales = w2_scales[e_ids] # w2_scales = w2_scales[e_ids]
w1_qzeros = w1_qzeros[e_ids] # w1_qzeros = w1_qzeros[e_ids]
w2_qzeros = w2_qzeros[e_ids] # w2_qzeros = w2_qzeros[e_ids]
else: # else:
e_map = None # e_map = None
with set_current_vllm_config(vllm_config): # with set_current_vllm_config(vllm_config):
triton_output = fused_moe(a, # triton_output = fused_moe(a,
w1_qweight, # w1_qweight,
w2_qweight, # w2_qweight,
score, # score,
topk, # topk,
renormalize=False, # renormalize=False,
use_int4_w4a16=weight_bits == 4, # use_int4_w4a16=weight_bits == 4,
use_int8_w8a16=weight_bits == 8, # use_int8_w8a16=weight_bits == 8,
use_int4_w4a8=weight_bits == 4, # use_int4_w4a8=weight_bits == 4,
global_num_experts=e, # global_num_experts=e,
expert_map=e_map, # expert_map=e_map,
w1_scale=w1_scales, # w1_scale=w1_scales,
w2_scale=w2_scales, # w2_scale=w2_scales,
w1_zp=w1_qzeros if has_zp else None, # w1_zp=w1_qzeros if has_zp else None,
w2_zp=w2_qzeros if has_zp else None, # w2_zp=w2_qzeros if has_zp else None,
block_shape=[0, group_size]) # block_shape=[0, group_size])
torch_output = torch_moe(a, # torch_output = torch_moe(a,
w1_ref, # w1_ref,
w2_ref, # w2_ref,
score, # score,
topk, # topk,
expert_map=e_map) # expert_map=e_map)
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) # torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
@pytest.mark.parametrize("dtype", @pytest.mark.parametrize("dtype",
......
...@@ -77,12 +77,12 @@ def test_auto_task(model_id, expected_runner_type, expected_task): ...@@ -77,12 +77,12 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"), ("model_id", "expected_runner_type", "expected_task"),
[ [
("distilbert/distilgpt2", "pooling", "embed"), (os.path.join(models_path_prefix, "distilbert/distilgpt2"), "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed"), (os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"), "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), (os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach"), "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"), (os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"), (os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"), "pooling", "embed"),
("openai/whisper-small", "pooling", "embed"), (os.path.join(models_path_prefix, "openai/whisper-small"), "pooling", "embed"),
], ],
) )
def test_score_task(model_id, expected_runner_type, expected_task): def test_score_task(model_id, expected_runner_type, expected_task):
......
...@@ -15,8 +15,7 @@ import torch ...@@ -15,8 +15,7 @@ import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from .utils import models_path_prefix from .utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname from vllm.platforms import current_platform
import vllm.envs as envs
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
...@@ -39,15 +38,16 @@ def test_max_tokens_none(): ...@@ -39,15 +38,16 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=None) max_tokens=None)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1, tensor_parallel_size=1)
block_size=64)
else: else:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1,
block_size=64)
prompts = ["Just say hello!"] prompts = ["Just say hello!"]
outputs = llm.generate(prompts, sampling_params=sampling_params) outputs = llm.generate(prompts, sampling_params=sampling_params)
...@@ -75,10 +75,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): ...@@ -75,10 +75,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# Don't use HF_TOKEN for ModelScope repos, otherwise it will fail # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
# with 400 Client Error: Bad Request. # with 400 Client Error: Bad Request.
m.setenv("HF_TOKEN", "") m.setenv("HF_TOKEN", "")
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat")) llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
......
...@@ -38,55 +38,55 @@ def default_max_tokens(): ...@@ -38,55 +38,55 @@ def default_max_tokens():
return 4096 return 4096
def test_sampling_params_from_request_with_no_guided_decoding_backend( # def test_sampling_params_from_request_with_no_guided_decoding_backend(
model_config, default_max_tokens): # model_config, default_max_tokens):
# guided_decoding_backend is not present at request level # # guided_decoding_backend is not present at request level
request = ChatCompletionRequest.model_validate({ # request = ChatCompletionRequest.model_validate({
'messages': [{ # 'messages': [{
'role': 'user', # 'role': 'user',
'content': 'Hello' # 'content': 'Hello'
}], # }],
'model': # 'model':
MODEL_NAME, # MODEL_NAME,
'response_format': { # 'response_format': {
'type': 'json_object', # 'type': 'json_object',
}, # },
}) # })
sampling_params = request.to_sampling_params( # sampling_params = request.to_sampling_params(
default_max_tokens, # default_max_tokens,
model_config.logits_processor_pattern, # model_config.logits_processor_pattern,
) # )
# we do not expect any backend to be present and the default # # we do not expect any backend to be present and the default
# guided_decoding_backend at engine level will be used. # # guided_decoding_backend at engine level will be used.
assert sampling_params.guided_decoding.backend is None # assert sampling_params.guided_decoding.backend is None
@pytest.mark.parametrize("request_level_guided_decoding_backend,expected", # @pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
[("xgrammar", "xgrammar"), # [("xgrammar", "xgrammar"),
("lm-format-enforcer", "lm-format-enforcer"), # ("lm-format-enforcer", "lm-format-enforcer"),
("outlines", "outlines")]) # ("outlines", "outlines")])
def test_sampling_params_from_request_with_guided_decoding_backend( # def test_sampling_params_from_request_with_guided_decoding_backend(
request_level_guided_decoding_backend: str, expected: str, # request_level_guided_decoding_backend: str, expected: str,
model_config, default_max_tokens): # model_config, default_max_tokens):
request = ChatCompletionRequest.model_validate({ # request = ChatCompletionRequest.model_validate({
'messages': [{ # 'messages': [{
'role': 'user', # 'role': 'user',
'content': 'Hello' # 'content': 'Hello'
}], # }],
'model': # 'model':
MODEL_NAME, # MODEL_NAME,
'response_format': { # 'response_format': {
'type': 'json_object', # 'type': 'json_object',
}, # },
'guided_decoding_backend': # 'guided_decoding_backend':
request_level_guided_decoding_backend, # request_level_guided_decoding_backend,
}) # })
sampling_params = request.to_sampling_params( # sampling_params = request.to_sampling_params(
default_max_tokens, # default_max_tokens,
model_config.logits_processor_pattern, # model_config.logits_processor_pattern,
) # )
# backend correctly identified in resulting sampling_params # # backend correctly identified in resulting sampling_params
assert sampling_params.guided_decoding.backend == expected # assert sampling_params.guided_decoding.backend == expected
...@@ -327,7 +327,7 @@ def test_dict_args(parser): ...@@ -327,7 +327,7 @@ def test_dict_args(parser):
"level": 1, "level": 1,
"use_inductor": True, "use_inductor": True,
"backend": "custom", "backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], "custom_ops": [ "-quant_fp8", "+silu_mul", "-rms_norm"],
} }
...@@ -475,32 +475,32 @@ def test_bind_kv_cache_non_attention(): ...@@ -475,32 +475,32 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): # def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet. # # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
with monkeypatch.context() as m: # with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0") # m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType # from vllm.attention import Attention, AttentionType
# example from bart # # example from bart
ctx = { # ctx = {
'encoder.layers.0.self_attn.attn': # 'encoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), # Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
'decoder.layers.0.encoder_attn.attn': # 'decoder.layers.0.encoder_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), # Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
'decoder.layers.0.self_attn.attn': # 'decoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), # Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
} # }
kv_cache = [ # kv_cache = [
torch.zeros((1, )), # torch.zeros((1, )),
] # ]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache # encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
bind_kv_cache(ctx, [kv_cache]) # bind_kv_cache(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache # assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] # assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] # assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_pp(): def test_bind_kv_cache_pp():
......
...@@ -20,6 +20,7 @@ from vllm.v1.request import Request, RequestStatus ...@@ -20,6 +20,7 @@ from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.request import StructuredOutputRequest from vllm.v1.structured_output.request import StructuredOutputRequest
from vllm.platforms import current_platform
from ...utils import models_path_prefix from ...utils import models_path_prefix
...@@ -999,7 +1000,7 @@ def test_kv_connector_unable_to_allocate(): ...@@ -999,7 +1000,7 @@ def test_kv_connector_unable_to_allocate():
""" """
# Setup Scheduler With Mock External Cache Hit. # Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 4 BLOCK_SIZE = 4 if not current_platform.is_rocm() else 64
NUM_BLOCKS = 10 NUM_BLOCKS = 10
scheduler = create_scheduler( scheduler = create_scheduler(
enable_prefix_caching=True, enable_prefix_caching=True,
...@@ -1070,132 +1071,132 @@ def test_kv_connector_unable_to_allocate(): ...@@ -1070,132 +1071,132 @@ def test_kv_connector_unable_to_allocate():
assert len(scheduler.waiting) == 0 assert len(scheduler.waiting) == 0
def test_kv_connector_handles_preemption(): # def test_kv_connector_handles_preemption():
""" # """
Test whether scheduler with KVConnector is able to handle # Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots(). # unable to allocate (run out of blocks in allocate_slots().
""" # """
# Setup Scheduler With Mock External Cache Hit. # # Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 2 # BLOCK_SIZE = 2 if not current_platform.is_rocm() else 64
# NOTE: there is 1 null block, so this is 6 blocks. # # NOTE: there is 1 null block, so this is 6 blocks.
NUM_BLOCKS = 7 # NUM_BLOCKS = 7
scheduler = create_scheduler( # scheduler = create_scheduler(
enable_prefix_caching=True, # enable_prefix_caching=True,
use_kv_connector=True, # use_kv_connector=True,
block_size=BLOCK_SIZE, # block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS, # num_blocks=NUM_BLOCKS,
) # )
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
scheduler.connector.get_num_new_matched_tokens.return_value = (
NUM_MATCHED_NEW_TOKENS, False)
# Create two requests.
# Both can be scheduled at first, but the second request
# will be preempted and re-scheduled.
NUM_REQUESTS = 2
NUM_TOKENS = BLOCK_SIZE * 2 + 1
MAX_TOKENS = BLOCK_SIZE * 2
requests = create_requests(num_requests=NUM_REQUESTS,
num_tokens=NUM_TOKENS,
max_tokens=MAX_TOKENS)
req_ids = []
req_to_index = {}
for i, request in enumerate(requests):
scheduler.add_request(request)
req_ids.append(request.request_id)
req_to_index[request.request_id] = i
MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[1000]] * len(req_ids),
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
)
# All can be scheduled - 1st token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# 2 remote kv cache hits.
num_requests=2,
expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
assert len(scheduler.running) == 2
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# All can be scheduled - 2nd token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.running) == 2
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# This will generate a new block and cause a preemption - 3rd token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
# Only 1 can be scheduled - 4th (and last token).
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.waiting) == 1
assert len(scheduler.running) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 0
# All memory should be freed since nothing is running.
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
== NUM_BLOCKS - 1
# Restarts the preempted request - generate 3rd token. # NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
# This will have a local and remote cache hit. # scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
output = scheduler.schedule() # scheduler.connector.get_num_new_matched_tokens.return_value = (
_assert_right_scheduler_output( # NUM_MATCHED_NEW_TOKENS, False)
output,
# 1 remote kv_cache hit! # # Create two requests.
num_requests=1, # # Both can be scheduled at first, but the second request
# Only 1 block was preempted and there is a single # # will be preempted and re-scheduled.
# remote hit. So only single new token is scheduled. # NUM_REQUESTS = 2
expected_num_scheduled_tokens=1, # NUM_TOKENS = BLOCK_SIZE * 2 + 1
) # MAX_TOKENS = BLOCK_SIZE * 2
assert len(scheduler.running) == 1 # requests = create_requests(num_requests=NUM_REQUESTS,
assert len(scheduler.waiting) == 0 # num_tokens=NUM_TOKENS,
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) # max_tokens=MAX_TOKENS)
assert len(scheduler.running) == 1 # req_ids = []
assert len(scheduler.waiting) == 0 # req_to_index = {}
# for i, request in enumerate(requests):
# scheduler.add_request(request)
# req_ids.append(request.request_id)
# req_to_index[request.request_id] = i
# MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
# req_ids=req_ids,
# req_id_to_index=req_to_index,
# sampled_token_ids=[[1000]] * len(req_ids),
# spec_token_ids=None,
# logprobs=None,
# prompt_logprobs_dict={},
# pooler_output=[],
# )
# Only 1 can be scheduled - 4th (and last token). # # All can be scheduled - 1st token.
output = scheduler.schedule() # output = scheduler.schedule()
_assert_right_scheduler_output( # _assert_right_scheduler_output(
output, # output,
# no connector_metadata # # 2 remote kv cache hits.
num_requests=0, # num_requests=2,
expected_num_scheduled_tokens=1) # expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
assert len(scheduler.running) == 1 # assert len(scheduler.running) == 2
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT) # _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 0
# All memory should be freed since nothing is running. # # All can be scheduled - 2nd token.
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ # output = scheduler.schedule()
== NUM_BLOCKS - 1 # _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 2
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# # This will generate a new block and cause a preemption - 3rd token.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.waiting) == 1
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
# # Restarts the preempted request - generate 3rd token.
# # This will have a local and remote cache hit.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # 1 remote kv_cache hit!
# num_requests=1,
# # Only 1 block was preempted and there is a single
# # remote hit. So only single new token is scheduled.
# expected_num_scheduled_tokens=1,
# )
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
def make_output(scheduler: Scheduler): def make_output(scheduler: Scheduler):
......
...@@ -53,17 +53,17 @@ def sampling_config(): ...@@ -53,17 +53,17 @@ def sampling_config():
@pytest.fixture @pytest.fixture
def model_name(): def model_name():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct") # return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return "meta-llama/Llama-3.1-8B-Instruct" return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
def eagle_model_name(): def eagle_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B") # return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
def eagle3_model_name(): def eagle3_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B") # return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
def test_ngram_correctness( def test_ngram_correctness(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer from vllm.v1.engine.detokenizer import IncrementalDetokenizer
from utils import models_path_prefix
# ruff: noqa: E501 # ruff: noqa: E501
...@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case(): ...@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
Thanks to reproducer from @fpaupier: Thanks to reproducer from @fpaupier:
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3. https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
""" """
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it") tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "google/gemma-3-1b-it"))
# Create a test request # Create a test request
prompt_token_ids = [107, 4606, 236787, 107] prompt_token_ids = [107, 4606, 236787, 107]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import filecmp import filecmp
import shutil import shutil
import tempfile import tempfile
...@@ -13,8 +14,9 @@ from vllm.distributed.kv_transfer.kv_connector.factory import ( ...@@ -13,8 +14,9 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa
SharedStorageConnector) SharedStorageConnector)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from utils import models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
PROMPT_CONTEXT = "Hi " * 100 PROMPT_CONTEXT = "Hi " * 100
PROMPTS = [ PROMPTS = [
...@@ -97,149 +99,149 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool: ...@@ -97,149 +99,149 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
return True return True
def test_multi_shared_storage_connector_consistency(): # def test_multi_shared_storage_connector_consistency():
""" # """
Tests that MultiConnector with two SharedStorageConnectors saves # Tests that MultiConnector with two SharedStorageConnectors saves
identical KV cache data to separate storage locations. # identical KV cache data to separate storage locations.
""" # """
storage_1_path = Path("storage_1/") # storage_1_path = Path("storage_1/")
storage_2_path = Path("storage_2/") # storage_2_path = Path("storage_2/")
shutil.rmtree(storage_1_path, ignore_errors=True) # shutil.rmtree(storage_1_path, ignore_errors=True)
shutil.rmtree(storage_2_path, ignore_errors=True) # shutil.rmtree(storage_2_path, ignore_errors=True)
storage_1_path.mkdir() # storage_1_path.mkdir()
storage_2_path.mkdir() # storage_2_path.mkdir()
# Configure MultiConnector with two SharedStorageConnectors # # Configure MultiConnector with two SharedStorageConnectors
kv_transfer_config = KVTransferConfig( # kv_transfer_config = KVTransferConfig(
kv_connector="MultiConnector", # kv_connector="MultiConnector",
kv_role="kv_both", # kv_role="kv_both",
kv_connector_extra_config={ # kv_connector_extra_config={
"connectors": [{ # "connectors": [{
"kv_connector": "TestSharedStorageConnector", # "kv_connector": "TestSharedStorageConnector",
"kv_role": "kv_both", # "kv_role": "kv_both",
"kv_connector_extra_config": { # "kv_connector_extra_config": {
"shared_storage_path": str(storage_1_path), # "shared_storage_path": str(storage_1_path),
"name": "storage1", # "name": "storage1",
} # }
}, { # }, {
"kv_connector": "TestSharedStorageConnector", # "kv_connector": "TestSharedStorageConnector",
"kv_role": "kv_both", # "kv_role": "kv_both",
"kv_connector_extra_config": { # "kv_connector_extra_config": {
"shared_storage_path": str(storage_2_path), # "shared_storage_path": str(storage_2_path),
"name": "storage2", # "name": "storage2",
} # }
}] # }]
}, # },
) # )
llm = LLM( # llm = LLM(
model=MODEL_NAME, # model=MODEL_NAME,
enforce_eager=True, # enforce_eager=True,
gpu_memory_utilization=0.5, # gpu_memory_utilization=0.5,
kv_transfer_config=kv_transfer_config, # kv_transfer_config=kv_transfer_config,
) # )
# Run generation - this should trigger saving KV cache # # Run generation - this should trigger saving KV cache
_ = llm.generate(PROMPTS, SAMPLING_PARAMS) # _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# --- Verification --- # # --- Verification ---
# Check that both storage directories were populated # # Check that both storage directories were populated
local_subdirs = list(storage_1_path.iterdir()) # local_subdirs = list(storage_1_path.iterdir())
external_subdirs = list(storage_2_path.iterdir()) # external_subdirs = list(storage_2_path.iterdir())
assert len( # assert len(
local_subdirs # local_subdirs
) > 0, f"Local storage path {storage_1_path} is empty after generation." # ) > 0, f"Local storage path {storage_1_path} is empty after generation."
assert len(external_subdirs) > 0, ( # assert len(external_subdirs) > 0, (
f"External storage path {storage_2_path} is empty after generation.") # f"External storage path {storage_2_path} is empty after generation.")
assert len(local_subdirs) == len(external_subdirs), ( # assert len(local_subdirs) == len(external_subdirs), (
f"Mismatch in number of cache entries: " # f"Mismatch in number of cache entries: "
f"Local={len(local_subdirs)}, External={len(external_subdirs)}") # f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
# The subdirectories should correspond to the prompt hashes # # The subdirectories should correspond to the prompt hashes
# Since prompts are the same, the hash directories should be the same name # # Since prompts are the same, the hash directories should be the same name
local_subdir_names = sorted([d.name for d in local_subdirs]) # local_subdir_names = sorted([d.name for d in local_subdirs])
external_subdir_names = sorted([d.name for d in external_subdirs]) # external_subdir_names = sorted([d.name for d in external_subdirs])
assert local_subdir_names == external_subdir_names, ( # assert local_subdir_names == external_subdir_names, (
"Cache directory names do not match between local and external storage" # "Cache directory names do not match between local and external storage"
) # )
# Compare the contents of each corresponding cache directory # # Compare the contents of each corresponding cache directory
for subdir_name in local_subdir_names: # for subdir_name in local_subdir_names:
print(f"Comparing contents of cache directory: {subdir_name}") # print(f"Comparing contents of cache directory: {subdir_name}")
assert _compare_directories(storage_1_path / subdir_name, # assert _compare_directories(storage_1_path / subdir_name,
storage_2_path / subdir_name), \ # storage_2_path / subdir_name), \
(f"Contents differ for cache directory '{subdir_name}' between " # (f"Contents differ for cache directory '{subdir_name}' between "
f"{storage_1_path} and {storage_2_path}") # f"{storage_1_path} and {storage_2_path}")
events = get_connector_events() # events = get_connector_events()
# get_num_new_matched_tokens and update_state_after_alloc will be called # # get_num_new_matched_tokens and update_state_after_alloc will be called
# on each connector in turn. # # on each connector in turn.
assert events["storage1-SCHEDULER"][:3] == [ # assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
] # ]
assert events["storage1-WORKER"][:5] == [ # assert events["storage1-WORKER"][:5] == [
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv', # 'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load', 'save_kv_layer' # 'wait_for_layer_load', 'save_kv_layer'
] # ]
assert events["storage2-SCHEDULER"][:3] == [ # assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
] # ]
assert events["storage2-WORKER"][:5] == [ # assert events["storage2-WORKER"][:5] == [
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv', # 'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load', 'save_kv_layer' # 'wait_for_layer_load', 'save_kv_layer'
] # ]
# Reset prefix cache or else we'll just get the tokens back from there. # # Reset prefix cache or else we'll just get the tokens back from there.
llm.reset_prefix_cache() # llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first # # Run generation again - this should trigger loading from the first
# connector. # # connector.
_ = llm.generate(PROMPTS, SAMPLING_PARAMS) # _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events = get_connector_events() # events = get_connector_events()
# get_num_new_matched_tokens will return new tokens from the first # # get_num_new_matched_tokens will return new tokens from the first
# connector so update_state_after_alloc will be with allocated blocks # # connector so update_state_after_alloc will be with allocated blocks
# on that one but with zero blocks for others (first nonzero match is # # on that one but with zero blocks for others (first nonzero match is
# chosen). # # chosen).
assert events["storage1-SCHEDULER"][:3] == [ # assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
] # ]
assert events["storage2-SCHEDULER"][:3] == [ # assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
] # ]
# Delete storage1 connector state # # Delete storage1 connector state
shutil.rmtree(storage_1_path) # shutil.rmtree(storage_1_path)
# Reset prefix cache or else we'll just get the tokens back from there. # # Reset prefix cache or else we'll just get the tokens back from there.
llm.reset_prefix_cache() # llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first # # Run generation again - this should trigger loading from the first
# connector. # # connector.
_ = llm.generate(PROMPTS, SAMPLING_PARAMS) # _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events = get_connector_events() # events = get_connector_events()
# get_num_new_matched_tokens will be called for both connectors but will # # get_num_new_matched_tokens will be called for both connectors but will
# return 0 from the first connector, but the second connector should have # # return 0 from the first connector, but the second connector should have
# a hit, so update_state_after_alloc will only be called with allocated # # a hit, so update_state_after_alloc will only be called with allocated
# blocks for the second connector. # # blocks for the second connector.
assert events["storage1-SCHEDULER"][:3] == [ # assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
] # ]
assert events["storage2-SCHEDULER"][:3] == [ # assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0', # 'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta' # 'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
] # ]
# Clean up # # Clean up
shutil.rmtree(storage_1_path) # shutil.rmtree(storage_1_path)
shutil.rmtree(storage_2_path) # shutil.rmtree(storage_2_path)
def get_connector_events() -> dict[str, list[str]]: def get_connector_events() -> dict[str, list[str]]:
......
...@@ -2,10 +2,12 @@ ...@@ -2,10 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
import ray import ray
import os
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
from utils import models_path_prefix
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
...@@ -17,7 +19,7 @@ def use_v1_only(monkeypatch): ...@@ -17,7 +19,7 @@ def use_v1_only(monkeypatch):
MODELS = [ MODELS = [
"distilbert/distilgpt2", os.path.join(models_path_prefix, "distilbert/distilgpt2"),
] ]
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
from unittest import mock from unittest import mock
import os
import pytest import pytest
import torch import torch
...@@ -12,10 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, ...@@ -12,10 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.eagle import EagleProposer
from ...utils import models_path_prefix
model_dir = "meta-llama/Llama-3.1-8B-Instruct" model_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" eagle_dir = os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" eagle3_dir = os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
def _create_proposer(method: str, k: int) -> EagleProposer: def _create_proposer(method: str, k: int) -> EagleProposer:
......
...@@ -8,6 +8,7 @@ import vllm.envs as envs ...@@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm import LLM from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [ UNSUPPORTED_MODELS_V1 = [
...@@ -121,9 +122,10 @@ def test_v1_llm_by_default(monkeypatch): ...@@ -121,9 +122,10 @@ def test_v1_llm_by_default(monkeypatch):
def test_v1_attn_backend(monkeypatch): def test_v1_attn_backend(monkeypatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None): if not current_platform.is_rocm():
m.delenv("VLLM_USE_V1") if os.getenv("VLLM_USE_V1", None):
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# Fall back to V0. # Fall back to V0.
_ = AsyncEngineArgs(model=MODEL).create_engine_config() _ = AsyncEngineArgs(model=MODEL).create_engine_config()
......
...@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): ...@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert torch.equal(actual, expected) assert torch.equal(actual, expected)
@pytest.mark.skipif(current_platform.is_rocm(),
reason="ROCM is not supported.")
@pytest.mark.parametrize("batch_size", list(range(1, 257))) @pytest.mark.parametrize("batch_size", list(range(1, 257)))
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) @pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment