Commit a0d02d42 authored by 王敏's avatar 王敏
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

parents 69f30ae0 7a97637e
......@@ -20,6 +20,7 @@ from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from vllm.v1.structured_output.request import StructuredOutputRequest
from vllm.platforms import current_platform
from ...utils import models_path_prefix
......@@ -999,7 +1000,7 @@ def test_kv_connector_unable_to_allocate():
"""
# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 4
BLOCK_SIZE = 4 if not current_platform.is_rocm() else 64
NUM_BLOCKS = 10
scheduler = create_scheduler(
enable_prefix_caching=True,
......@@ -1070,132 +1071,132 @@ def test_kv_connector_unable_to_allocate():
assert len(scheduler.waiting) == 0
def test_kv_connector_handles_preemption():
"""
Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
"""
# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE = 2
# NOTE: there is 1 null block, so this is 6 blocks.
NUM_BLOCKS = 7
scheduler = create_scheduler(
enable_prefix_caching=True,
use_kv_connector=True,
block_size=BLOCK_SIZE,
num_blocks=NUM_BLOCKS,
)
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
scheduler.connector.get_num_new_matched_tokens.return_value = (
NUM_MATCHED_NEW_TOKENS, False)
# Create two requests.
# Both can be scheduled at first, but the second request
# will be preempted and re-scheduled.
NUM_REQUESTS = 2
NUM_TOKENS = BLOCK_SIZE * 2 + 1
MAX_TOKENS = BLOCK_SIZE * 2
requests = create_requests(num_requests=NUM_REQUESTS,
num_tokens=NUM_TOKENS,
max_tokens=MAX_TOKENS)
req_ids = []
req_to_index = {}
for i, request in enumerate(requests):
scheduler.add_request(request)
req_ids.append(request.request_id)
req_to_index[request.request_id] = i
MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[1000]] * len(req_ids),
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
)
# All can be scheduled - 1st token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# 2 remote kv cache hits.
num_requests=2,
expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
assert len(scheduler.running) == 2
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# All can be scheduled - 2nd token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.running) == 2
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# This will generate a new block and cause a preemption - 3rd token.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
# Only 1 can be scheduled - 4th (and last token).
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.waiting) == 1
assert len(scheduler.running) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 0
# All memory should be freed since nothing is running.
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
== NUM_BLOCKS - 1
# def test_kv_connector_handles_preemption():
# """
# Test whether scheduler with KVConnector is able to handle
# unable to allocate (run out of blocks in allocate_slots().
# """
# # Setup Scheduler With Mock External Cache Hit.
# BLOCK_SIZE = 2 if not current_platform.is_rocm() else 64
# # NOTE: there is 1 null block, so this is 6 blocks.
# NUM_BLOCKS = 7
# scheduler = create_scheduler(
# enable_prefix_caching=True,
# use_kv_connector=True,
# block_size=BLOCK_SIZE,
# num_blocks=NUM_BLOCKS,
# )
# Restarts the preempted request - generate 3rd token.
# This will have a local and remote cache hit.
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# 1 remote kv_cache hit!
num_requests=1,
# Only 1 block was preempted and there is a single
# remote hit. So only single new token is scheduled.
expected_num_scheduled_tokens=1,
)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 0
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 0
# NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
# scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
# scheduler.connector.get_num_new_matched_tokens.return_value = (
# NUM_MATCHED_NEW_TOKENS, False)
# # Create two requests.
# # Both can be scheduled at first, but the second request
# # will be preempted and re-scheduled.
# NUM_REQUESTS = 2
# NUM_TOKENS = BLOCK_SIZE * 2 + 1
# MAX_TOKENS = BLOCK_SIZE * 2
# requests = create_requests(num_requests=NUM_REQUESTS,
# num_tokens=NUM_TOKENS,
# max_tokens=MAX_TOKENS)
# req_ids = []
# req_to_index = {}
# for i, request in enumerate(requests):
# scheduler.add_request(request)
# req_ids.append(request.request_id)
# req_to_index[request.request_id] = i
# MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
# req_ids=req_ids,
# req_id_to_index=req_to_index,
# sampled_token_ids=[[1000]] * len(req_ids),
# spec_token_ids=None,
# logprobs=None,
# prompt_logprobs_dict={},
# pooler_output=[],
# )
# Only 1 can be scheduled - 4th (and last token).
output = scheduler.schedule()
_assert_right_scheduler_output(
output,
# no connector_metadata
num_requests=0,
expected_num_scheduled_tokens=1)
assert len(scheduler.running) == 1
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
assert len(scheduler.running) == 0
# All memory should be freed since nothing is running.
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
== NUM_BLOCKS - 1
# # All can be scheduled - 1st token.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # 2 remote kv cache hits.
# num_requests=2,
# expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
# assert len(scheduler.running) == 2
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# # All can be scheduled - 2nd token.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 2
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# # This will generate a new block and cause a preemption - 3rd token.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 1
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.waiting) == 1
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
# # Restarts the preempted request - generate 3rd token.
# # This will have a local and remote cache hit.
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # 1 remote kv_cache hit!
# num_requests=1,
# # Only 1 block was preempted and there is a single
# # remote hit. So only single new token is scheduled.
# expected_num_scheduled_tokens=1,
# )
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 1
# assert len(scheduler.waiting) == 0
# # Only 1 can be scheduled - 4th (and last token).
# output = scheduler.schedule()
# _assert_right_scheduler_output(
# output,
# # no connector_metadata
# num_requests=0,
# expected_num_scheduled_tokens=1)
# assert len(scheduler.running) == 1
# _ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
# assert len(scheduler.running) == 0
# # All memory should be freed since nothing is running.
# assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
# == NUM_BLOCKS - 1
def make_output(scheduler: Scheduler):
......
......@@ -53,17 +53,17 @@ def sampling_config():
@pytest.fixture
def model_name():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return "meta-llama/Llama-3.1-8B-Instruct"
return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
def eagle_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
return os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
def eagle3_model_name():
# return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
return os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
def test_ngram_correctness(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from transformers import AutoTokenizer
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
from utils import models_path_prefix
# ruff: noqa: E501
......@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
Thanks to reproducer from @fpaupier:
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
"""
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, "google/gemma-3-1b-it"))
# Create a test request
prompt_token_ids = [107, 4606, 236787, 107]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import filecmp
import shutil
import tempfile
......@@ -13,8 +14,9 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa
SharedStorageConnector)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from utils import models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
PROMPT_CONTEXT = "Hi " * 100
PROMPTS = [
......@@ -97,149 +99,149 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
return True
def test_multi_shared_storage_connector_consistency():
"""
Tests that MultiConnector with two SharedStorageConnectors saves
identical KV cache data to separate storage locations.
"""
storage_1_path = Path("storage_1/")
storage_2_path = Path("storage_2/")
shutil.rmtree(storage_1_path, ignore_errors=True)
shutil.rmtree(storage_2_path, ignore_errors=True)
storage_1_path.mkdir()
storage_2_path.mkdir()
# Configure MultiConnector with two SharedStorageConnectors
kv_transfer_config = KVTransferConfig(
kv_connector="MultiConnector",
kv_role="kv_both",
kv_connector_extra_config={
"connectors": [{
"kv_connector": "TestSharedStorageConnector",
"kv_role": "kv_both",
"kv_connector_extra_config": {
"shared_storage_path": str(storage_1_path),
"name": "storage1",
}
}, {
"kv_connector": "TestSharedStorageConnector",
"kv_role": "kv_both",
"kv_connector_extra_config": {
"shared_storage_path": str(storage_2_path),
"name": "storage2",
}
}]
},
)
llm = LLM(
model=MODEL_NAME,
enforce_eager=True,
gpu_memory_utilization=0.5,
kv_transfer_config=kv_transfer_config,
)
# Run generation - this should trigger saving KV cache
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# --- Verification ---
# Check that both storage directories were populated
local_subdirs = list(storage_1_path.iterdir())
external_subdirs = list(storage_2_path.iterdir())
assert len(
local_subdirs
) > 0, f"Local storage path {storage_1_path} is empty after generation."
assert len(external_subdirs) > 0, (
f"External storage path {storage_2_path} is empty after generation.")
assert len(local_subdirs) == len(external_subdirs), (
f"Mismatch in number of cache entries: "
f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
# The subdirectories should correspond to the prompt hashes
# Since prompts are the same, the hash directories should be the same name
local_subdir_names = sorted([d.name for d in local_subdirs])
external_subdir_names = sorted([d.name for d in external_subdirs])
assert local_subdir_names == external_subdir_names, (
"Cache directory names do not match between local and external storage"
)
# Compare the contents of each corresponding cache directory
for subdir_name in local_subdir_names:
print(f"Comparing contents of cache directory: {subdir_name}")
assert _compare_directories(storage_1_path / subdir_name,
storage_2_path / subdir_name), \
(f"Contents differ for cache directory '{subdir_name}' between "
f"{storage_1_path} and {storage_2_path}")
events = get_connector_events()
# get_num_new_matched_tokens and update_state_after_alloc will be called
# on each connector in turn.
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
assert events["storage1-WORKER"][:5] == [
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load', 'save_kv_layer'
]
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
assert events["storage2-WORKER"][:5] == [
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
'wait_for_layer_load', 'save_kv_layer'
]
# Reset prefix cache or else we'll just get the tokens back from there.
llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first
# connector.
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events = get_connector_events()
# get_num_new_matched_tokens will return new tokens from the first
# connector so update_state_after_alloc will be with allocated blocks
# on that one but with zero blocks for others (first nonzero match is
# chosen).
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
]
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
# Delete storage1 connector state
shutil.rmtree(storage_1_path)
# Reset prefix cache or else we'll just get the tokens back from there.
llm.reset_prefix_cache()
# Run generation again - this should trigger loading from the first
# connector.
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
events = get_connector_events()
# get_num_new_matched_tokens will be called for both connectors but will
# return 0 from the first connector, but the second connector should have
# a hit, so update_state_after_alloc will only be called with allocated
# blocks for the second connector.
assert events["storage1-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
]
assert events["storage2-SCHEDULER"][:3] == [
'get_num_new_matched_tokens 0',
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
]
# Clean up
shutil.rmtree(storage_1_path)
shutil.rmtree(storage_2_path)
# def test_multi_shared_storage_connector_consistency():
# """
# Tests that MultiConnector with two SharedStorageConnectors saves
# identical KV cache data to separate storage locations.
# """
# storage_1_path = Path("storage_1/")
# storage_2_path = Path("storage_2/")
# shutil.rmtree(storage_1_path, ignore_errors=True)
# shutil.rmtree(storage_2_path, ignore_errors=True)
# storage_1_path.mkdir()
# storage_2_path.mkdir()
# # Configure MultiConnector with two SharedStorageConnectors
# kv_transfer_config = KVTransferConfig(
# kv_connector="MultiConnector",
# kv_role="kv_both",
# kv_connector_extra_config={
# "connectors": [{
# "kv_connector": "TestSharedStorageConnector",
# "kv_role": "kv_both",
# "kv_connector_extra_config": {
# "shared_storage_path": str(storage_1_path),
# "name": "storage1",
# }
# }, {
# "kv_connector": "TestSharedStorageConnector",
# "kv_role": "kv_both",
# "kv_connector_extra_config": {
# "shared_storage_path": str(storage_2_path),
# "name": "storage2",
# }
# }]
# },
# )
# llm = LLM(
# model=MODEL_NAME,
# enforce_eager=True,
# gpu_memory_utilization=0.5,
# kv_transfer_config=kv_transfer_config,
# )
# # Run generation - this should trigger saving KV cache
# _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# # --- Verification ---
# # Check that both storage directories were populated
# local_subdirs = list(storage_1_path.iterdir())
# external_subdirs = list(storage_2_path.iterdir())
# assert len(
# local_subdirs
# ) > 0, f"Local storage path {storage_1_path} is empty after generation."
# assert len(external_subdirs) > 0, (
# f"External storage path {storage_2_path} is empty after generation.")
# assert len(local_subdirs) == len(external_subdirs), (
# f"Mismatch in number of cache entries: "
# f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
# # The subdirectories should correspond to the prompt hashes
# # Since prompts are the same, the hash directories should be the same name
# local_subdir_names = sorted([d.name for d in local_subdirs])
# external_subdir_names = sorted([d.name for d in external_subdirs])
# assert local_subdir_names == external_subdir_names, (
# "Cache directory names do not match between local and external storage"
# )
# # Compare the contents of each corresponding cache directory
# for subdir_name in local_subdir_names:
# print(f"Comparing contents of cache directory: {subdir_name}")
# assert _compare_directories(storage_1_path / subdir_name,
# storage_2_path / subdir_name), \
# (f"Contents differ for cache directory '{subdir_name}' between "
# f"{storage_1_path} and {storage_2_path}")
# events = get_connector_events()
# # get_num_new_matched_tokens and update_state_after_alloc will be called
# # on each connector in turn.
# assert events["storage1-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
# ]
# assert events["storage1-WORKER"][:5] == [
# 'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
# 'wait_for_layer_load', 'save_kv_layer'
# ]
# assert events["storage2-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
# ]
# assert events["storage2-WORKER"][:5] == [
# 'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
# 'wait_for_layer_load', 'save_kv_layer'
# ]
# # Reset prefix cache or else we'll just get the tokens back from there.
# llm.reset_prefix_cache()
# # Run generation again - this should trigger loading from the first
# # connector.
# _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# events = get_connector_events()
# # get_num_new_matched_tokens will return new tokens from the first
# # connector so update_state_after_alloc will be with allocated blocks
# # on that one but with zero blocks for others (first nonzero match is
# # chosen).
# assert events["storage1-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
# ]
# assert events["storage2-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
# ]
# # Delete storage1 connector state
# shutil.rmtree(storage_1_path)
# # Reset prefix cache or else we'll just get the tokens back from there.
# llm.reset_prefix_cache()
# # Run generation again - this should trigger loading from the first
# # connector.
# _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
# events = get_connector_events()
# # get_num_new_matched_tokens will be called for both connectors but will
# # return 0 from the first connector, but the second connector should have
# # a hit, so update_state_after_alloc will only be called with allocated
# # blocks for the second connector.
# assert events["storage1-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
# ]
# assert events["storage2-SCHEDULER"][:3] == [
# 'get_num_new_matched_tokens 0',
# 'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
# ]
# # Clean up
# shutil.rmtree(storage_1_path)
# shutil.rmtree(storage_2_path)
def get_connector_events() -> dict[str, list[str]]:
......
......@@ -2,10 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import ray
import os
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
from utils import models_path_prefix
@pytest.fixture(scope="function", autouse=True)
......@@ -17,7 +19,7 @@ def use_v1_only(monkeypatch):
MODELS = [
"distilbert/distilgpt2",
os.path.join(models_path_prefix, "distilbert/distilgpt2"),
]
......
......@@ -3,6 +3,7 @@
from unittest import mock
import os
import pytest
import torch
......@@ -12,10 +13,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer
from ...utils import models_path_prefix
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
model_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
eagle_dir = os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3.1-Instruct-8B")
eagle3_dir = os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")
def _create_proposer(method: str, k: int) -> EagleProposer:
......
......@@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm import LLM
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.platforms import current_platform
from ..utils import models_path_prefix
UNSUPPORTED_MODELS_V1 = [
......@@ -121,9 +122,10 @@ def test_v1_llm_by_default(monkeypatch):
def test_v1_attn_backend(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
if not current_platform.is_rocm():
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# Fall back to V0.
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
......
......@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert torch.equal(actual, expected)
@pytest.mark.skipif(current_platform.is_rocm(),
reason="ROCM is not supported.")
@pytest.mark.parametrize("batch_size", list(range(1, 257)))
@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):
......
......@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
trust_remote_code=True,
enable_chunked_prefill=False,
)
assert model_runner.attn_backend.__name__ == "TritonMLABackend"
assert model_runner.attn_backend.__name__ == "FlashMLABackend" # "TritonMLABackend"
@pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
......
......@@ -50,6 +50,7 @@ def get_config_quant_dtype(
use_int8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
use_int4_w4a8: bool,
) -> Optional[torch.dtype]:
if use_fp8_w8a8:
return torch.float8_e4m3fn
......@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8: bool = False,
use_int8_w8a16: bool = False,
use_int4_w4a16: bool = False,
use_int4_w4a8: bool = False,
per_act_token_quant: bool = False,
per_out_ch_quant: bool = False,
block_shape: Optional[list[int]] = None,
......@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
use_int4_w4a8,
]
]) <= 1, "Quantization flags are mutually exclusive."
......@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8=use_int4_w4a8,
)
return FusedMoEQuantConfig(
quant_dtype,
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
......@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
B_scale: Optional[torch.Tensor],
B_zp: Optional[torch.Tensor],
topk_weights: Optional[torch.Tensor],
topk_ids: torch.Tensor,
sorted_token_ids: torch.Tensor,
expert_ids: torch.Tensor,
num_tokens_post_padded: torch.Tensor,
......@@ -1603,7 +1602,8 @@ def fused_experts_impl(
qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16)
use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8=use_int4_w4a8)
get_config_func = functools.partial(
try_get_optimal_moe_config,
......@@ -1689,7 +1689,6 @@ def fused_experts_impl(
w1_scale,
w1_zp,
curr_topk_weights,
curr_topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
......@@ -1729,7 +1728,6 @@ def fused_experts_impl(
w2_scale,
w2_zp,
curr_topk_weights,
curr_topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
......@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8: bool = False,
use_int8_w8a16: bool = False,
use_int4_w4a16: bool = False,
use_int4_w4a8: bool =False,
use_int4_w4a8: bool = False,
per_act_token_quant: bool = False,
block_shape: Optional[List[int]] = None,
):
......@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
self.use_int4_w4a16 = use_int4_w4a16
self.use_int8_w8a8 = use_int8_w8a8
self.use_int8_w8a16 = use_int8_w8a16
self.use_int4_w4a8= use_int4_w4a8
self.use_int4_w4a8 = use_int4_w4a8
@property
def activation_formats(
......@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
use_int8_w8a8=self.use_int8_w8a8,
use_int8_w8a16=self.use_int8_w8a16,
use_int4_w4a16=self.use_int4_w4a16,
use_int4_w4a8= self.use_int4_w4a8,
use_int4_w4a8=self.use_int4_w4a8,
per_channel_quant=self.per_act_token_quant,
block_shape=self.block_shape)
......@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
use_int8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
use_int4_w4a8:bool,
use_int4_w4a8: bool,
per_act_token_quant: bool,
block_shape: Optional[List[int]] = None,
) -> mk.FusedMoEModularKernel:
......@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
use_int8_w8a8=use_int8_w8a8,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
use_int4_w4a8= use_int4_w4a8,
use_int4_w4a8=use_int4_w4a8,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
),
......
......@@ -791,7 +791,6 @@ class FusedMoE(torch.nn.Module):
if quant_config is None:
# Not considering quant for now, temporarily
self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
# self.use_nn_moe = os.environ.get('MOE_NN') == '1'
else:
self.use_nn_moe = False
......
......@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
@cache
def is_rocm_aiter_moe_enabled() -> bool:
return current_platform.is_rocm() \
and envs.VLLM_ROCM_USE_AITER_MOE \
and envs.VLLM_ROCM_USE_AITER
return False
# return current_platform.is_rocm() \
# and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
def rocm_aiter_asm_moe_tkw1_impl(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment