Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import os
from typing import Any, Callable, Optional, Union
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
class Mock:
    """Empty placeholder class with no executor behaviour.

    It is handed to the engine args as ``distributed_executor_backend`` in
    the type-checking test, which expects a ``ValueError``.
    """
class CustomMultiprocExecutor(MultiprocExecutor):
    """MultiprocExecutor that drops a ``.marker`` file on every RPC.

    The tests in this file look for that file in the working directory to
    show that this class's ``collective_rpc`` was invoked.
    """

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None,
                       non_block: bool = False,
                       unique_reply_rank: Optional[int] = None) -> list[Any]:
        """Create ``.marker`` in the current directory, then delegate.

        All arguments are forwarded unchanged to
        ``MultiprocExecutor.collective_rpc``; the return value is whatever
        the parent implementation returns.
        """
        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
        # Forward non_block / unique_reply_rank as well: dropping them would
        # silently turn every call into a blocking, all-ranks RPC.
        return super().collective_rpc(method,
                                      timeout,
                                      args,
                                      kwargs,
                                      non_block=non_block,
                                      unique_reply_rank=unique_reply_rank)
# Second name for the same executor class; the async test below references it
# both as an object and by its dotted import path.
CustomMultiprocExecutorAsync = CustomMultiprocExecutor
# Model name passed as `model=` to every EngineArgs/AsyncEngineArgs in this file.
MODEL = "Qwen/Qwen3-0.6B"
def test_custom_executor_type_checking():
    """Passing the plain ``Mock`` class as executor backend must raise.

    The same arguments are tried against the synchronous ``LLMEngine`` and
    the asynchronous ``AsyncLLM``; each is expected to raise ``ValueError``
    somewhere between building the args and building the engine.
    """
    bad_backend_kwargs = dict(
        model=MODEL,
        gpu_memory_utilization=0.2,
        max_model_len=8192,
        distributed_executor_backend=Mock,
    )

    # Synchronous engine.
    with pytest.raises(ValueError):
        sync_args = EngineArgs(**bad_backend_kwargs)
        LLMEngine.from_engine_args(sync_args)

    # Asynchronous engine.
    with pytest.raises(ValueError):
        async_args = AsyncEngineArgs(**bad_backend_kwargs)
        AsyncLLM.from_engine_args(async_args)
@pytest.mark.parametrize("distributed_executor_backend", [
    CustomMultiprocExecutor,
    "tests.v1.executor.test_executor.CustomMultiprocExecutor"
])
def test_custom_executor(distributed_executor_backend, tmp_path):
    """Run one engine step and check the custom executor left its marker.

    The test works inside ``tmp_path`` so the ``.marker`` file written by
    ``CustomMultiprocExecutor.collective_rpc`` lands in a fresh directory;
    the previous working directory is always restored afterwards.
    """
    previous_dir = os.path.abspath(".")
    os.chdir(tmp_path)
    try:
        # Nothing may have written the marker before the engine exists.
        assert not os.path.exists(".marker")

        llm_engine = LLMEngine.from_engine_args(
            EngineArgs(
                model=MODEL,
                gpu_memory_utilization=0.2,
                max_model_len=8192,
                distributed_executor_backend=distributed_executor_backend,
                enforce_eager=True,  # reduce test time
            ))
        one_token = SamplingParams(max_tokens=1)
        llm_engine.add_request("0", "foo", one_token)
        llm_engine.step()

        assert os.path.exists(".marker")
    finally:
        os.chdir(previous_dir)
@pytest.mark.parametrize("distributed_executor_backend", [
    CustomMultiprocExecutorAsync,
    "tests.v1.executor.test_executor.CustomMultiprocExecutorAsync"
])
def test_custom_executor_async(distributed_executor_backend, tmp_path):
    """Async counterpart: generate through AsyncLLM and look for the marker.

    Runs inside ``tmp_path`` and restores the previous working directory
    when done, whether or not the assertions pass.
    """
    previous_dir = os.path.abspath(".")
    os.chdir(tmp_path)
    try:
        # Nothing may have written the marker before the engine exists.
        assert not os.path.exists(".marker")

        async_engine = AsyncLLM.from_engine_args(
            AsyncEngineArgs(
                model=MODEL,
                gpu_memory_utilization=0.2,
                max_model_len=8192,
                distributed_executor_backend=distributed_executor_backend,
                enforce_eager=True,  # reduce test time
            ))
        one_token = SamplingParams(max_tokens=1)

        async def _drain_one_request():
            # Exhaust the output stream; the outputs themselves are unused.
            outputs = async_engine.generate(request_id="0",
                                            prompt="foo",
                                            sampling_params=one_token)
            async for _ in outputs:
                pass

        asyncio.run(_drain_one_request())

        assert os.path.exists(".marker")
    finally:
        os.chdir(previous_dir)
...@@ -14,6 +14,7 @@ from unittest.mock import patch ...@@ -14,6 +14,7 @@ from unittest.mock import patch
import pytest import pytest
import ray import ray
import torch
from vllm import LLM from vllm import LLM
from vllm.config import KVTransferConfig from vllm.config import KVTransferConfig
...@@ -22,6 +23,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( ...@@ -22,6 +23,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
NixlConnectorWorker) NixlConnectorWorker)
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from .utils import create_request, create_scheduler, create_vllm_config from .utils import create_request, create_scheduler, create_vllm_config
...@@ -98,7 +100,6 @@ class FakeNixlWrapper: ...@@ -98,7 +100,6 @@ class FakeNixlWrapper:
def set_cycles_before_xfer_done(self, cycles: int): def set_cycles_before_xfer_done(self, cycles: int):
"""Set the number of cycles before a transfer is considered done.""" """Set the number of cycles before a transfer is considered done."""
self._cycles_before_xfer_done = cycles
@contextlib.contextmanager @contextlib.contextmanager
...@@ -562,3 +563,86 @@ def _run_abort_timeout_test(llm_kwargs: dict, timeout: int): ...@@ -562,3 +563,86 @@ def _run_abort_timeout_test(llm_kwargs: dict, timeout: int):
sampling_params) sampling_params)
# Request-0 times out and is cleared! # Request-0 times out and is cleared!
assert '0' not in req_to_blocks assert '0' not in req_to_blocks
def test_register_kv_caches(dist_init):
    """
    Check what register_kv_caches() hands to the (mocked) nixl wrapper.

    Three layer names are registered, two of them pointing at the same
    tensor. The test asserts that:
    1. get_reg_descs() was called and its first positional argument holds
       four entries whose base address and size match the first-axis slices
       of the two distinct tensors.
    2. get_xfer_descs() was called and its first positional argument holds
       eight entries, each with a length of half a registered region.
    """
    vllm_config = create_vllm_config()

    # Build the cache tensors in the shape the FlashAttention backend reports.
    cache_shape = FlashAttentionBackend.get_kv_cache_shape(num_blocks=2,
                                                           block_size=16,
                                                           num_kv_heads=4,
                                                           head_size=64)
    shared_tensor = torch.zeros(*cache_shape, dtype=torch.float16)
    unique_tensor = torch.zeros(*cache_shape, dtype=torch.float16)
    kv_caches = {
        "layer0": shared_tensor,
        "layer1": unique_tensor,
        "layer2": shared_tensor,
    }

    # Expected values: byte size of one first-axis slice, and the addresses
    # of slices 0 and 1 of the shared tensor followed by the unique tensor.
    first_slice = shared_tensor[0]
    expected_tensor_size = first_slice.numel() * first_slice.element_size()
    expected_base_addrs = [
        tensor[slice_idx].data_ptr()
        for tensor in (shared_tensor, unique_tensor) for slice_idx in (0, 1)
    ]

    with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper") as wrapper_cls, \
         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \
         patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"):  # noqa: E501
        # Build the connector and swap in the fake worker.
        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
        connector.connector_worker = FakeNixlConnectorWorker(
            vllm_config, connector.engine_id, hand_shake_latency=0)

        # Route the worker's wrapper to the mock instance so calls are
        # recorded.
        wrapper = wrapper_cls.return_value
        connector.connector_worker.nixl_wrapper = wrapper

        # Run the code under test.
        connector.register_kv_caches(kv_caches)

        # --- get_reg_descs: one entry per distinct first-axis slice ---
        assert wrapper.get_reg_descs.called
        caches_data, _ = wrapper.get_reg_descs.call_args.args
        assert len(caches_data) == 4

        for i, (base_addr, size, _tp_rank, _) in enumerate(caches_data):
            assert size == expected_tensor_size, \
                f"Entry {i}: Expected tensor size {expected_tensor_size}, " \
                f"got {size}"
            assert base_addr == expected_base_addrs[i], \
                f"Entry {i}: Expected base address {expected_base_addrs[i]}, " \
                f"got {base_addr}"

        # --- get_xfer_descs: block-level entries ---
        assert wrapper.get_xfer_descs.called
        blocks_data, _ = wrapper.get_xfer_descs.call_args.args

        # Check how many block entries there are and how long each one is.
        expected_blocks_count = 8
        assert len(blocks_data) == expected_blocks_count, \
            f"Expected {expected_blocks_count} blocks, " \
            f"got {len(blocks_data)}"

        expected_block_len = expected_tensor_size // 2
        for i, (_block_start_addr, block_len,
                _block_tp_rank) in enumerate(blocks_data):
            assert block_len == expected_block_len, \
                f"Block entry {i}: Expected block len {expected_block_len}, " \
                f"got {block_len}"
...@@ -162,9 +162,7 @@ def create_request(request_id: int, ...@@ -162,9 +162,7 @@ def create_request(request_id: int,
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params, sampling_params=sampling_params,
pooling_params=None, pooling_params=None,
multi_modal_kwargs=None, mm_features=None,
multi_modal_placeholders=None,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID, eos_token_id=EOS_TOKEN_ID,
block_hasher=get_request_block_hasher(block_size, hash_fn), block_hasher=get_request_block_hasher(block_size, hash_fn),
) )
...@@ -200,7 +198,6 @@ def create_model_runner_output( ...@@ -200,7 +198,6 @@ def create_model_runner_output(
req_ids=req_ids, req_ids=req_ids,
req_id_to_index=req_id_to_index, req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids, sampled_token_ids=sampled_token_ids,
spec_token_ids=None,
logprobs=None, logprobs=None,
prompt_logprobs_dict={}, prompt_logprobs_dict={},
pooler_output=None, pooler_output=None,
......
...@@ -8,10 +8,9 @@ from typing import Optional ...@@ -8,10 +8,9 @@ from typing import Optional
import torch import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate,
LogitsProcessor, LogitsProcessor)
MoveDirectionality) from vllm.v1.sample.logits_processor.builtin import process_dict_updates
MODEL_NAME = "facebook/opt-125m" MODEL_NAME = "facebook/opt-125m"
POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
...@@ -45,37 +44,19 @@ class DummyLogitsProcessor(LogitsProcessor): ...@@ -45,37 +44,19 @@ class DummyLogitsProcessor(LogitsProcessor):
def __init__(self, vllm_config: "VllmConfig", device: torch.device, def __init__(self, vllm_config: "VllmConfig", device: torch.device,
is_pin_memory: bool): is_pin_memory: bool):
self.req_info: dict[int, SamplingParams] = {} self.req_info: dict[int, int] = {}
def is_argmax_invariant(self) -> bool: def is_argmax_invariant(self) -> bool:
"""Never impacts greedy sampling""" """Never impacts greedy sampling"""
return False return False
def update_state(self, batch_update: Optional[BatchUpdate]): def update_state(self, batch_update: Optional[BatchUpdate]):
if not batch_update: process_dict_updates(
return self.req_info,
batch_update,
# Process added requests. lambda params, _, __: params.extra_args and
for index, params, _, _ in batch_update.added: (params.extra_args.get("target_token")),
assert params is not None )
if params.extra_args and (target_token :=
params.extra_args.get("target_token")):
self.req_info[index] = target_token
if self.req_info:
# Process removed requests.
for index in batch_update.removed:
self.req_info.pop(index, None)
# Process moved requests, unidirectional move (a->b) and swap
# (a<->b)
for adx, bdx, direct in batch_update.moved:
a_val = self.req_info.pop(adx, None)
b_val = self.req_info.pop(bdx, None)
if a_val is not None:
self.req_info[bdx] = a_val
if direct == MoveDirectionality.SWAP and b_val is not None:
self.req_info[adx] = b_val
def apply(self, logits: torch.Tensor) -> torch.Tensor: def apply(self, logits: torch.Tensor) -> torch.Tensor:
if not self.req_info: if not self.req_info:
......
...@@ -458,9 +458,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch): ...@@ -458,9 +458,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
assert len(logprob) == vocab_size assert len(logprob) == vocab_size
@pytest.mark.parametrize( @pytest.mark.parametrize("logprobs_mode", list(LogprobsMode))
"logprobs_mode",
["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"])
def test_logprobs_mode(logprobs_mode: LogprobsMode, def test_logprobs_mode(logprobs_mode: LogprobsMode,
monkeypatch: pytest.MonkeyPatch): monkeypatch: pytest.MonkeyPatch):
"""Test with LLM engine with different logprobs_mode. """Test with LLM engine with different logprobs_mode.
...@@ -489,12 +487,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode, ...@@ -489,12 +487,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode,
for logprobs in output.logprobs: for logprobs in output.logprobs:
for token_id in logprobs: for token_id in logprobs:
logprob = logprobs[token_id] logprob = logprobs[token_id]
if "logprobs" in logprobs_mode: if logprobs_mode in (LogprobsMode.RAW_LOGPROBS,
LogprobsMode.PROCESSED_LOGPROBS):
assert logprob.logprob <= 0 assert logprob.logprob <= 0
if logprob.logprob > 0: if logprob.logprob > 0:
positive_values = positive_values + 1 positive_values = positive_values + 1
total_token_with_logprobs = total_token_with_logprobs + 1 total_token_with_logprobs = total_token_with_logprobs + 1
assert total_token_with_logprobs >= len(results[0].outputs) assert total_token_with_logprobs >= len(results[0].outputs)
if "logits" in logprobs_mode: if logprobs_mode in (LogprobsMode.RAW_LOGITS,
LogprobsMode.PROCESSED_LOGITS):
assert positive_values > 0 assert positive_values > 0
del llm del llm
...@@ -50,6 +50,7 @@ def forward_attention( ...@@ -50,6 +50,7 @@ def forward_attention(
dtype=torch.int32, dtype=torch.int32,
) )
context_lens = seq_lens - query_lens context_lens = seq_lens - query_lens
max_seq_len = int(seq_lens.max())
max_query_len = q_len max_query_len = q_len
num_actual_tokens = query_start_loc[-1] num_actual_tokens = query_start_loc[-1]
...@@ -81,6 +82,7 @@ def forward_attention( ...@@ -81,6 +82,7 @@ def forward_attention(
num_reqs=batch_size, num_reqs=batch_size,
num_actual_tokens=num_actual_tokens, num_actual_tokens=num_actual_tokens,
max_query_len=max_query_len, max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table, block_table_tensor=block_table,
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
) )
......
...@@ -78,9 +78,10 @@ async def generate( ...@@ -78,9 +78,10 @@ async def generate(
], ],
) )
@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"]) @pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"])
@pytest.mark.parametrize("async_scheduling", [True, False])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load(output_kind: RequestOutputKind, async def test_load(output_kind: RequestOutputKind, data_parallel_backend: str,
data_parallel_backend: str): async_scheduling: bool):
stats_loggers = {} stats_loggers = {}
...@@ -108,6 +109,7 @@ async def test_load(output_kind: RequestOutputKind, ...@@ -108,6 +109,7 @@ async def test_load(output_kind: RequestOutputKind,
prompt = "This is a test of data parallel" prompt = "This is a test of data parallel"
engine_args.data_parallel_backend = data_parallel_backend engine_args.data_parallel_backend = data_parallel_backend
engine_args.async_scheduling = async_scheduling
engine = AsyncLLM.from_engine_args(engine_args, engine = AsyncLLM.from_engine_args(engine_args,
stat_loggers=[SimpleStatsLogger]) stat_loggers=[SimpleStatsLogger])
after.callback(engine.shutdown) after.callback(engine.shutdown)
......
...@@ -11,7 +11,8 @@ import torch ...@@ -11,7 +11,8 @@ import torch
from vllm.multimodal.inputs import (MultiModalBatchedField, from vllm.multimodal.inputs import (MultiModalBatchedField,
MultiModalFieldElem, MultiModalFlatField, MultiModalFieldElem, MultiModalFlatField,
MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField, NestedTensors) MultiModalSharedField, NestedTensors)
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
...@@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch): ...@@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
class MyRequest(msgspec.Struct): class MyRequest(msgspec.Struct):
mm: Optional[list[MultiModalKwargs]] mm: Optional[list[MultiModalKwargsItems]]
def test_multimodal_kwargs(): def test_multimodal_kwargs():
...@@ -119,7 +120,7 @@ def test_multimodal_kwargs(): ...@@ -119,7 +120,7 @@ def test_multimodal_kwargs():
audio = MultiModalKwargsItem.from_elems([e1]) audio = MultiModalKwargsItem.from_elems([e1])
video = MultiModalKwargsItem.from_elems([e2]) video = MultiModalKwargsItem.from_elems([e2])
image = MultiModalKwargsItem.from_elems([e3, e4]) image = MultiModalKwargsItem.from_elems([e3, e4])
mm = MultiModalKwargs([audio, video, image]) mm = MultiModalKwargsItems.from_seq([audio, video, image])
# pack mm kwargs into a mock request so that it can be decoded properly # pack mm kwargs into a mock request so that it can be decoded properly
req = MyRequest([mm]) req = MyRequest([mm])
...@@ -133,19 +134,22 @@ def test_multimodal_kwargs(): ...@@ -133,19 +134,22 @@ def test_multimodal_kwargs():
total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)
# expected total encoding length, should be 14255, +-20 for minor changes # expected total encoding length, should be 14306, +-20 for minor changes
assert 14250 <= total_len <= 14300 assert 14275 <= total_len <= 14325
decoded: MultiModalKwargs = decoder.decode(encoded).mm[0] decoded = decoder.decode(encoded).mm[0]
assert isinstance(decoded, MultiModalKwargsItems)
# check all modalities were recovered and do some basic sanity checks # check all modalities were recovered and do some basic sanity checks
assert len(decoded.modalities) == 3 assert len(decoded) == 3
images = decoded.get_items("image") images = decoded["image"]
assert len(images) == 1 assert len(images) == 1
assert len(images[0].items()) == 2 assert len(images[0].items()) == 2
assert list(images[0].keys()) == ["i0", "i1"] assert list(images[0].keys()) == ["i0", "i1"]
# check the tensor contents and layout in the main dict # check the tensor contents and layout in the main dict
assert all(nested_equal(mm[k], decoded[k]) for k in mm) mm_data = mm.get_data()
decoded_data = decoded.get_data()
assert all(nested_equal(mm_data[k], decoded_data[k]) for k in mm_data)
def nested_equal(a: NestedTensors, b: NestedTensors): def nested_equal(a: NestedTensors, b: NestedTensors):
......
...@@ -85,7 +85,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: ...@@ -85,7 +85,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -164,7 +164,7 @@ def test_update_states_request_finished(model_runner): ...@@ -164,7 +164,7 @@ def test_update_states_request_finished(model_runner):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids={req_id}, finished_req_ids={req_id},
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -194,7 +194,7 @@ def test_update_states_request_resumed(model_runner): ...@@ -194,7 +194,7 @@ def test_update_states_request_resumed(model_runner):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -221,7 +221,7 @@ def test_update_states_request_resumed(model_runner): ...@@ -221,7 +221,7 @@ def test_update_states_request_resumed(model_runner):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -252,7 +252,7 @@ def test_update_states_no_changes(model_runner): ...@@ -252,7 +252,7 @@ def test_update_states_no_changes(model_runner):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -287,7 +287,7 @@ def test_update_states_request_unscheduled(model_runner): ...@@ -287,7 +287,7 @@ def test_update_states_request_unscheduled(model_runner):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
......
...@@ -205,6 +205,7 @@ def _construct_cached_request_state(req_id_suffix: int): ...@@ -205,6 +205,7 @@ def _construct_cached_request_state(req_id_suffix: int):
pooling_params=None, pooling_params=None,
mm_kwargs=[], mm_kwargs=[],
mm_positions=[], mm_positions=[],
mm_hashes=[],
block_ids=([], ), block_ids=([], ),
generator=None, generator=None,
num_computed_tokens=len(output_token_ids), num_computed_tokens=len(output_token_ids),
......
...@@ -143,7 +143,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: ...@@ -143,7 +143,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -209,7 +209,7 @@ def test_update_states_request_finished(model_runner, dist_init): ...@@ -209,7 +209,7 @@ def test_update_states_request_finished(model_runner, dist_init):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids={req_id}, finished_req_ids={req_id},
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -241,7 +241,7 @@ def test_update_states_request_resumed(model_runner, dist_init): ...@@ -241,7 +241,7 @@ def test_update_states_request_resumed(model_runner, dist_init):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -268,7 +268,7 @@ def test_update_states_request_resumed(model_runner, dist_init): ...@@ -268,7 +268,7 @@ def test_update_states_request_resumed(model_runner, dist_init):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -349,7 +349,7 @@ def test_update_states_no_changes(model_runner, dist_init): ...@@ -349,7 +349,7 @@ def test_update_states_no_changes(model_runner, dist_init):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -386,7 +386,7 @@ def test_update_states_request_unscheduled(model_runner, dist_init): ...@@ -386,7 +386,7 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
scheduled_encoder_inputs={}, scheduled_encoder_inputs={},
num_common_prefix_blocks=0, num_common_prefix_blocks=0,
finished_req_ids=set(), finished_req_ids=set(),
free_encoder_input_ids=[], free_encoder_mm_hashes=[],
structured_output_request_ids={}, structured_output_request_ids={},
grammar_bitmask=None, grammar_bitmask=None,
) )
...@@ -682,6 +682,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): ...@@ -682,6 +682,7 @@ def test_init_kv_cache_with_kv_sharing_valid():
kv_cache_spec[layer_0].page_size_bytes kv_cache_spec[layer_0].page_size_bytes
runner.initialize_kv_cache(kv_cache_config) runner.initialize_kv_cache(kv_cache_config)
kv_cache_config_after_init = runner.kv_cache_config
layer_0_kv = vllm_ctx[layer_0].kv_cache[0] layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
layer_1_kv = vllm_ctx[layer_1].kv_cache[0] layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
...@@ -689,10 +690,12 @@ def test_init_kv_cache_with_kv_sharing_valid(): ...@@ -689,10 +690,12 @@ def test_init_kv_cache_with_kv_sharing_valid():
assert id(layer_1_kv) == id(layer_0_kv) assert id(layer_1_kv) == id(layer_0_kv)
# check layer 1 added to kv cache group's layer names # check layer 1 added to kv cache group's layer names
assert len(kv_cache_config.kv_cache_groups) == 1 assert len(kv_cache_config_after_init.kv_cache_groups) == 1
assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 assert len(kv_cache_config_after_init.kv_cache_groups[0].layer_names) == 2
assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[
assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 0] == layer_0
assert kv_cache_config_after_init.kv_cache_groups[0].layer_names[
1] == layer_1
def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
......
...@@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing ...@@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing
awq, casperhansen/mixtral-instruct-awq, main awq, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
qqq, HandH1998/QQQ-Llama-3-8b-g128, main
qqq, HandH1998/QQQ-Llama-3-8b, main
hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
None, mgleize/fairseq2-dummy-Llama-3.2-1B, main None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
...@@ -9,10 +9,7 @@ from vllm.attention import AttentionMetadata, AttentionMetadataBuilder ...@@ -9,10 +9,7 @@ from vllm.attention import AttentionMetadata, AttentionMetadataBuilder
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.backends.utils import CommonAttentionState
from vllm.model_executor import SamplingMetadata from vllm.model_executor import SamplingMetadata
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from vllm.worker.pooling_model_runner import (
ModelInputForGPUWithPoolingMetadata)
class MockAttentionBackend(AttentionBackend): class MockAttentionBackend(AttentionBackend):
...@@ -114,54 +111,3 @@ def test_model_runner_input(): ...@@ -114,54 +111,3 @@ def test_model_runner_input():
assert (received_model_input.sampling_metadata.selected_token_indices == assert (received_model_input.sampling_metadata.selected_token_indices ==
sampling_metadata.selected_token_indices) sampling_metadata.selected_token_indices)
assert received_model_input.sampling_metadata.seq_groups is None assert received_model_input.sampling_metadata.seq_groups is None
def test_embedding_model_runner_input():
pooling_metadata = PoolingMetadata(
seq_groups=[[0]],
seq_data={},
prompt_lens=[1],
)
attn_metadata = AttentionMetadata(
num_prefills=1,
num_prefill_tokens=2,
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
multi_modal_placeholder_index_maps=None,
enable_kv_scales_calculation=True,
)
model_input = ModelInputForGPUWithPoolingMetadata(
input_tokens=torch.ones(10),
input_positions=torch.ones(10),
pooling_metadata=pooling_metadata,
attn_metadata=attn_metadata)
assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata)
# Test round trip serialization.
tensor_dict = model_input.as_broadcastable_tensor_dict()
attn_backend = MockAttentionBackend()
received_model_input = (
ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
tensor_dict, attn_backend=attn_backend))
# Check that received copy has correct values.
assert isinstance(received_model_input,
ModelInputForGPUWithPoolingMetadata)
assert received_model_input.input_tokens is not None
assert (
received_model_input.input_tokens == model_input.input_tokens).all()
assert received_model_input.input_positions is not None
assert (received_model_input.input_positions == model_input.input_positions
).all()
assert received_model_input.multi_modal_kwargs is None
assert (received_model_input.multi_modal_kwargs ==
model_input.multi_modal_kwargs)
assert received_model_input.lora_requests is None
assert received_model_input.lora_requests == model_input.lora_requests
assert received_model_input.lora_mapping is None
assert received_model_input.lora_mapping == model_input.lora_mapping
for field in dataclasses.fields(AttentionMetadata):
assert getattr(received_model_input.attn_metadata, field.name,
None) == getattr(attn_metadata, field.name, None)
# Pooling metadata is not broadcast.
assert received_model_input.pooling_metadata is None
...@@ -37,7 +37,7 @@ ALLOWED_FILES = set([ ...@@ -37,7 +37,7 @@ ALLOWED_FILES = set([
'vllm/distributed/utils.py', 'vllm/distributed/utils.py',
'vllm/distributed/parallel_state.py', 'vllm/distributed/parallel_state.py',
'vllm/engine/multiprocessing/client.py', 'vllm/engine/multiprocessing/client.py',
'vllm/distributed/device_communicators/custom_all_reduce_utils.py', 'vllm/distributed/device_communicators/all_reduce_utils.py',
'vllm/distributed/device_communicators/shm_broadcast.py', 'vllm/distributed/device_communicators/shm_broadcast.py',
'vllm/engine/multiprocessing/engine.py', 'vllm/engine/multiprocessing/engine.py',
'benchmarks/kernels/graph_machete_bench.py', 'benchmarks/kernels/graph_machete_bench.py',
......
...@@ -77,6 +77,7 @@ clone_repo() { ...@@ -77,6 +77,7 @@ clone_repo() {
local repo_url=$1 local repo_url=$1
local dir_name=$2 local dir_name=$2
local key_file=$3 local key_file=$3
local commit_hash=$4
if [ -d "$dir_name" ]; then if [ -d "$dir_name" ]; then
# Check if directory has uncommitted changes (dirty) # Check if directory has uncommitted changes (dirty)
...@@ -87,17 +88,27 @@ clone_repo() { ...@@ -87,17 +88,27 @@ clone_repo() {
echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning" echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
rm -rf "$dir_name" rm -rf "$dir_name"
git clone "$repo_url" git clone "$repo_url"
if [ -n "$commit_hash" ]; then
cd "$dir_name"
git checkout "$commit_hash"
cd ..
fi
else else
echo "$dir_name directory exists and appears complete; manually update if needed" echo "$dir_name directory exists and appears complete; manually update if needed"
fi fi
else else
git clone "$repo_url" git clone "$repo_url"
if [ -n "$commit_hash" ]; then
cd "$dir_name"
git checkout "$commit_hash"
cd ..
fi
fi fi
} }
# build and install pplx, require pytorch installed # build and install pplx, require pytorch installed
pushd $WORKSPACE pushd $WORKSPACE
clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
cd pplx-kernels cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation # PIP_NO_BUILD_ISOLATION=0 disables build isolation
...@@ -106,7 +117,7 @@ popd ...@@ -106,7 +117,7 @@ popd
# build and install deepep, require pytorch installed # build and install deepep, require pytorch installed
pushd $WORKSPACE pushd $WORKSPACE
clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "e3908bf"
cd DeepEP cd DeepEP
export NVSHMEM_DIR=$WORKSPACE/nvshmem_install export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e . PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
......
#!/bin/bash
# Script to install DeepGEMM from source
# This script can be used both in Docker builds and by users locally
#
# Flow: parse options -> determine the CUDA version -> skip (exit 0) when CUDA
# is older than 12.8 -> clone the requested ref into a temporary directory ->
# build a wheel -> install it with uv (when available) or pip.
#
# Options:
#   --ref REF            Git reference to check out (defaults to the pinned commit below)
#   --cuda-version VER   CUDA version as MAJOR[.MINOR[.PATCH]]; when omitted, a
#                        pre-set CUDA_VERSION environment variable is used, and
#                        otherwise the version is parsed from `nvcc --version`
#   -h, --help           Show usage and exit
set -e

# Default values
DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --ref)
            # Reject a missing value or one that looks like the next option.
            if [[ -z "$2" || "$2" =~ ^- ]]; then
                echo "Error: --ref requires an argument." >&2
                exit 1
            fi
            DEEPGEMM_GIT_REF="$2"
            shift 2
            ;;
        --cuda-version)
            if [[ -z "$2" || "$2" =~ ^- ]]; then
                echo "Error: --cuda-version requires an argument." >&2
                exit 1
            fi
            CUDA_VERSION="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo " --ref REF Git reference to checkout (default: $DEEPGEMM_GIT_REF)"
            echo " --cuda-version VER CUDA version (auto-detected if not provided)"
            echo " -h, --help Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done

# Auto-detect CUDA version if not provided
if [ -z "$CUDA_VERSION" ]; then
    if command -v nvcc >/dev/null 2>&1; then
        CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
        # nvcc exists but its output could not be parsed: fail loudly instead
        # of carrying an empty version into the numeric comparison below.
        if [ -z "$CUDA_VERSION" ]; then
            echo "Error: Could not parse CUDA version from nvcc output. Please specify with --cuda-version" >&2
            exit 1
        fi
        echo "Auto-detected CUDA version: $CUDA_VERSION"
    else
        echo "Error: Could not auto-detect CUDA version. Please specify with --cuda-version" >&2
        exit 1
    fi
fi

# Extract major and minor version numbers.
# Accepts MAJOR, MAJOR.MINOR or MAJOR.MINOR.PATCH...; a bare MAJOR is treated
# as MAJOR.0. Anything else is rejected, because a non-numeric component would
# make the integer tests below error out and evaluate as "false", which would
# silently bypass the version gate.
CUDA_VERSION_RE='^([0-9]+)(\.([0-9]+)(\.[0-9]+)*)?$'
if [[ ! "$CUDA_VERSION" =~ $CUDA_VERSION_RE ]]; then
    echo "Error: invalid CUDA version '$CUDA_VERSION' (expected MAJOR.MINOR, e.g. 12.8)" >&2
    exit 1
fi
CUDA_MAJOR="${BASH_REMATCH[1]}"
CUDA_MINOR="${BASH_REMATCH[3]:-0}"
echo "CUDA version: $CUDA_VERSION (major: $CUDA_MAJOR, minor: $CUDA_MINOR)"

# Check CUDA version requirement
# Not meeting the requirement is a clean skip (exit 0), not a failure.
if [ "$CUDA_MAJOR" -lt 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -lt 8 ]; }; then
    echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
    exit 0
fi

echo "Installing DeepGEMM from source..."
echo "Repository: $DEEPGEMM_GIT_REPO"
echo "Reference: $DEEPGEMM_GIT_REF"

# Create a temporary directory for the build
INSTALL_DIR=$(mktemp -d)
# Remove the build directory on every exit path, including failures under set -e.
trap 'rm -rf "$INSTALL_DIR"' EXIT

# Clone the repository
git clone --recursive --shallow-submodules "$DEEPGEMM_GIT_REPO" "$INSTALL_DIR/deepgemm"

echo "🏗️ Building DeepGEMM"
pushd "$INSTALL_DIR/deepgemm"

# Checkout the specific reference
git checkout "$DEEPGEMM_GIT_REF"
# The clone populated submodules for the default branch HEAD; re-sync them so
# they match the commits recorded by the checked-out reference.
git submodule update --init --recursive

# Build DeepGEMM
# (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
rm -rf build dist
rm -rf *.egg-info
python3 setup.py bdist_wheel

# Install the wheel
if command -v uv >/dev/null 2>&1; then
    echo "Installing DeepGEMM wheel using uv..."
    # Use --system in Docker contexts, respect user's environment otherwise
    if [ -n "$VLLM_DOCKER_BUILD_CONTEXT" ]; then
        uv pip install --system dist/*.whl
    else
        uv pip install dist/*.whl
    fi
else
    echo "Installing DeepGEMM wheel using pip..."
    python3 -m pip install dist/*.whl
fi

popd
echo "✅ DeepGEMM installation completed successfully"
\ No newline at end of file
...@@ -36,8 +36,7 @@ profiling and analyzing nsys profile output. ...@@ -36,8 +36,7 @@ profiling and analyzing nsys profile output.
## Notes ## Notes
- Make sure you have pandas installed. - Make sure you have pandas installed.
- Make sure nsys is installed, and specify the path to the `nsys` command with - Make sure [nsys](https://developer.nvidia.com/nsight-systems/get-started) is installed, and specify the path to the `nsys` command with `--nsys_cmd` if it is not in your PATH.
`--nsys_cmd` if it is not in your PATH.
- For more details on available engines and models, see the help string in - For more details on available engines and models, see the help string in
the script or run: the script or run:
...@@ -135,34 +134,31 @@ time which would cause a difference for the overall category. ...@@ -135,34 +134,31 @@ time which would cause a difference for the overall category.
## Example 3: add new classification for a new model ## Example 3: add new classification for a new model
Suppose there's a new model ABC that is available for engine DEF, and say there To create a new engine DEF with model ABC, just add another json file in the same directory as
are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels gputrc2graph.py with the same format as the other json files. The script will automatically pick up all the json files in the same directory as engine/model specifications.
Then, for this new model, suppose there are 4 kernels to be classified into "gemm" and "attn", where the gemm kernels
have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*" have names with "*H*" or "*I*" in them, and attn kernels have names with "*J*"
or "*K*" in them, add a new entry like so: or "*K*" in them, just add another .json file in the same directory as
gputrc2graph.py with the same format as the other json files, like the following:
```python
engine_model = { ```json
'DEF': { {
'ABC': { "DEF": {
'layer_anno': { "ABC": {
'Stage': { "H|I": "gemm",
'.*': 'layer', "J|K": "attn",
}, "CUDA mem": "non-gpu-H_D_memops",
'Substage': { ".*": "misc"
'H|I': 'gemm', }
'J|K': 'attn', }
'CUDA mem': 'non-gpu-H_D_memops', }
'.*': 'misc'
}
}
},
}
'vllm': {...}
``` ```
Basically Substage is a dictionary with a list of key/value pairs, where the Each entry in the dictionary consists of:
keys are regex's of the kernel names to be classified, and values are the
classification bins which one wishes to compare across engines/models. - key: a regex used to classify the kernels
- value: the category to classify the kernels into.
The last 2 entries are common for all engine/models, consisting of CUDA memory The last 2 entries are common for all engine/models, consisting of CUDA memory
operations and a 'misc' for anything that's leftover and can't be classified. operations and a 'misc' for anything that's leftover and can't be classified.
...@@ -173,3 +169,6 @@ like the following: ...@@ -173,3 +169,6 @@ like the following:
```bash ```bash
--infile new.nsys-rep,DEF,ABC,<runtime> --infile new.nsys-rep,DEF,ABC,<runtime>
``` ```
If the engine_DEF.json file already exists, just add the model as a new node in
the existing engine file, after the other models.
...@@ -15,132 +15,18 @@ logger = logging.getLogger(__name__) ...@@ -15,132 +15,18 @@ logger = logging.getLogger(__name__)
# helper data class for annotating kernels # helper data class for annotating kernels
class EngineModelData: def load_engine_model():
# engine + model mappings """ returns engine_model built from all json files in the current dir """
engine_model = { import glob
'vllm': { import json
'llama': { engine_model = {}
'layer_anno': {
'Stage': { json_files = glob.glob(
'.*': 'layer', os.path.join(os.path.dirname(__file__) or ".", "*.json"))
}, for fname in json_files:
'Substage': { with open(fname, encoding="utf-8") as f:
'gemm': 'gemm', engine_model.update(json.load(f))
'fused_moe_kernel|GroupProblemShape|group_gemm_starts': return engine_model
'moe_gemm', #llama4
'moe|sigmoid': 'moe', #llama4
'CatArrayBatched|prepare_inputs': 'prepare_next',
'flash': 'attn',
'ncclDevKernel|cross_device_reduce':
'nccl_and_custom_ar',
'_norm_': 'norm',
'act_and_mul_': 'silu',
'rotary_embedding_kernel': 'rope',
'SoftMax': 'softmax',
'elementwise': 'elementwise',
'fp8_quant': 'quantize',
'reduce_kernel': 'reduce',
'triton': 'triton_kernel',
'CUDA mem': 'non-gpu-H_D_memops',
'.*': 'misc'
}
}
},
'ds': {
'layer_anno': {
'Stage': {
'.*': 'layer',
},
'Substage': {
'block_fp8|gemm_fp8_blockwise':
'block_fp8_gemm',
'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal':
'moe_gemm',
'gemm|matmul|nvjet':
'gemm',
'moe|sigmoid|expert':
'moe',
'_fwd_|FlashAttn|_mla_|_attn_':
'attn',
'CatArrayBatched':
'prepare_next',
'ncclDevKernel|cross_device_reduce':
'nccl_and_custom_ar',
'Norm|_norm_':
'norm',
'sbtopk':
'topk',
'act_and_mul_':
'activation',
'compute_position_kernel':
'rope',
'elementwise':
'elementwise',
'fp8_quant|quant_fp8|cvt_fp16_to_fp4':
'quantize',
'reduce':
'reduce',
'SoftMax':
'softmax',
'triton':
'triton_kernel',
'CUDA mem':
'non-gpu-H_D_memops',
'.*':
'misc'
}
}
},
'gpt-oss': {
'layer_anno': {
'Stage': {
'.*': 'layer',
},
'Substage': {
'block_fp8|gemm_fp8_blockwise':
'block_fp8_gemm',
'fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_'
# this section is triton_moe_gemm
'|matmul_ogs_|_topk_forward|_combined_routing'
'|_sum_bitmatrix_rows|_compute_writeback_idx':
'moe_gemm',
'gemm|matmul|nvjet':
'gemm',
'moe|sigmoid|expert|splitKreduce':
'moe',
'_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha':
'attn',
'CatArrayBatched':
'prepare_next',
'ncclDevKernel|cross_device_reduce':
'nccl_and_custom_ar',
'Norm|_norm_':
'norm',
'sbtopk':
'topk',
'act_and_mul_':
'activation',
'compute_position_kernel':
'rope',
'elementwise':
'elementwise',
'fp8_quant|quant_fp8|cvt_fp16_to_fp4|quantize':
'quantize',
'reduce':
'reduce',
'SoftMax':
'softmax',
'triton':
'triton_kernel',
'CUDA mem':
'non-gpu-H_D_memops',
'.*':
'misc'
}
}
}
},
}
class GPUTrace2Graph: class GPUTrace2Graph:
...@@ -148,8 +34,7 @@ class GPUTrace2Graph: ...@@ -148,8 +34,7 @@ class GPUTrace2Graph:
Parses output of nsys report, generates csv and bar chart output Parses output of nsys report, generates csv and bar chart output
""" """
def __init__(self, nsys_cmd): def __init__(self):
self.nsys_cmd = nsys_cmd
import pandas as pd # avoid importing till needed import pandas as pd # avoid importing till needed
self.pd = pd self.pd = pd
self.pd.options.mode.copy_on_write = True self.pd.options.mode.copy_on_write = True
...@@ -227,7 +112,7 @@ class GPUTrace2Graph: ...@@ -227,7 +112,7 @@ class GPUTrace2Graph:
title = 'Model_Engine' title = 'Model_Engine'
x = 'Model_Engine' x = 'Model_Engine'
y = 'Elapsed Time (sec)' y = 'Elapsed Time (sec)'
color = 'Substage' color = 'Category'
""" generate kernel mapping table """ """ generate kernel mapping table """
# Sort Model_Engine categories by last field after underscore # Sort Model_Engine categories by last field after underscore
df['Model_Engine'] = self.pd.Categorical( df['Model_Engine'] = self.pd.Categorical(
...@@ -249,14 +134,13 @@ class GPUTrace2Graph: ...@@ -249,14 +134,13 @@ class GPUTrace2Graph:
Generate data table with columns per Model_Engine into result.html Generate data table with columns per Model_Engine into result.html
""" """
pivot_df = df.pivot_table(values='Elapsed Time (sec)', pivot_df = df.pivot_table(values='Elapsed Time (sec)',
index='Substage', index='Category',
columns='Model_Engine', columns='Model_Engine',
aggfunc='sum', aggfunc='sum',
observed=False).round(2) observed=False).round(2)
# Add sum row at bottom # Add sum row at bottom
pivot_df.loc['total_elapsed_sec'] = pivot_df.sum() pivot_df.loc['total_elapsed_sec'] = pivot_df.sum()
pivot_df.fillna('').to_html('temp.html') pivot_df.fillna('').to_html('temp.html')
print('got')
with (open(f'{output_name}.html', 'a', encoding='utf-8') as with (open(f'{output_name}.html', 'a', encoding='utf-8') as
outfile, open('temp.html', encoding='utf-8') as infile): outfile, open('temp.html', encoding='utf-8') as infile):
outfile.write(infile.read()) outfile.write(infile.read())
...@@ -264,23 +148,22 @@ class GPUTrace2Graph: ...@@ -264,23 +148,22 @@ class GPUTrace2Graph:
print(f'Finished generating: \n' print(f'Finished generating: \n'
f' {output_name}.html for stack bar chart \n' f' {output_name}.html for stack bar chart \n'
f' {output_name}.csv for Kernel-Substage mapping') f' {output_name}.csv for Kernel-Category mapping')
def anno_gpu_kernname(self, df, mapping): def anno_gpu_kernname(self, df, mapping):
""" add "stage" and "substage" columns """ """ add "Category" column """
def anno_gpu_kernname_helper(name, stage): def anno_gpu_kernname_helper(name):
for kern_name, val in mapping['layer_anno'][stage].items(): for kern_name, val in mapping.items():
if re.search(kern_name, name): if re.search(kern_name, name):
return val return val
for stage in ['Stage', 'Substage']: df['Category'] = df['Name'].apply(anno_gpu_kernname_helper)
df[stage] = df['Name'].apply(anno_gpu_kernname_helper, stage=stage)
def make_nongpu_row(self, df, nongpu_sec): def make_nongpu_row(self, df, nongpu_sec):
""" this will append non-gpu time entry at end of df """ """ this will append non-gpu time entry at end of df """
nongpu_row = self.pd.DataFrame([df.iloc[-1]]) nongpu_row = self.pd.DataFrame([df.iloc[-1]])
nongpu_row['Substage'] = nongpu_row['Name'] = 'CPU(non-GPU)' nongpu_row['Category'] = nongpu_row['Name'] = 'CPU(non-GPU)'
nongpu_row['Instances'] = 1 nongpu_row['Instances'] = 1
nongpu_row['Elapsed Time (sec)'] = nongpu_sec nongpu_row['Elapsed Time (sec)'] = nongpu_sec
return (nongpu_row) return (nongpu_row)
...@@ -302,7 +185,7 @@ class GPUTrace2Graph: ...@@ -302,7 +185,7 @@ class GPUTrace2Graph:
logger.info('generating %s', new_file) logger.info('generating %s', new_file)
return True return True
def gen_sum_file(self, file): def gen_sum_file(self, file, nsys_cmd):
""" """
generates sum file from nsys trace with times per kernel and generates sum file from nsys trace with times per kernel and
returns the name of the sum file returns the name of the sum file
...@@ -318,17 +201,21 @@ class GPUTrace2Graph: ...@@ -318,17 +201,21 @@ class GPUTrace2Graph:
sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv' sum_file = f'{file_dir}/{file_name}_cuda_gpu_kernel_tracesum.csv'
if self.should_gen_file(nsys_stats_file, file): if self.should_gen_file(nsys_stats_file, file):
cmd = [ cmd = [
self.nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o', nsys_cmd, 'stats', '-r', 'cuda_gpu_trace', file, '-o',
f'{file_dir}/{file_name}' f'{file_dir}/{file_name}'
] ]
cmd_str = ' '.join(cmd) cmd_str = ' '.join(cmd)
logger.info('+ %s', cmd_str) logger.info('+ %s', cmd_str)
# estimate time based on calibrated 240M/min
file_size_mb = os.path.getsize(file) / 1e6
logger.info(
'nsys stats for %.2f MB file expected to take %.2f min',
file_size_mb, file_size_mb / 240)
try: try:
subprocess.run(cmd) subprocess.run(cmd, check=True)
except Exception: except Exception:
logger.error( logger.error("%s failed; Use --nsys_cmd to specify nsys path",
"%s failed, specify --nsys_cmd for correct nsys path", cmd_str)
cmd_str)
exit(1) exit(1)
logger.info('generating non-overalapped sum %s', sum_file) logger.info('generating non-overalapped sum %s', sum_file)
self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file) self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
...@@ -336,7 +223,7 @@ class GPUTrace2Graph: ...@@ -336,7 +223,7 @@ class GPUTrace2Graph:
logger.info('Finished generating %s', sum_file) logger.info('Finished generating %s', sum_file)
return sum_file return sum_file
def gen_graph(self, in_file, out_dir, title): def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
""" generates graph and csv file from in_file into out_dir """ """ generates graph and csv file from in_file into out_dir """
# Initialize an empty DataFrame to store combined data # Initialize an empty DataFrame to store combined data
combined_df = self.pd.DataFrame() combined_df = self.pd.DataFrame()
...@@ -345,17 +232,16 @@ class GPUTrace2Graph: ...@@ -345,17 +232,16 @@ class GPUTrace2Graph:
file_name = os.path.basename(file) file_name = os.path.basename(file)
if not file_dir: if not file_dir:
file_dir = '.' file_dir = '.'
sum_file = self.gen_sum_file(file) sum_file = self.gen_sum_file(file, nsys_cmd)
# read kernel summary file # read kernel summary file
df = self.pd.read_csv(sum_file) df = self.pd.read_csv(sum_file)
# annotate kernel to their categories # annotate kernel to their categories
assert EngineModelData.engine_model.get(engine) assert engine_model.get(engine), f'engine {engine} unknown'
assert EngineModelData.engine_model[engine].get(model) assert engine_model[engine].get(model), f'model {model} unknown'
# remove nsys-rep from file_name for shorter x-label # remove nsys-rep from file_name for shorter x-label
file_name = file_name.replace('.nsys-rep', '') file_name = file_name.replace('.nsys-rep', '')
df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}' df['Model_Engine'] = f'{model}_{engine}_{file_name}_{idx}'
self.anno_gpu_kernname(df, self.anno_gpu_kernname(df, engine_model[engine][model])
EngineModelData.engine_model[engine][model])
# patch in non-gpu time # patch in non-gpu time
gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1) gpu_sec = round(df['Elapsed Time (sec)'].sum(), 1)
total_sec = round(float(total_sec), 1) total_sec = round(float(total_sec), 1)
...@@ -393,12 +279,12 @@ def main(): ...@@ -393,12 +279,12 @@ def main():
"--out_dir results/ --title \"Model=gpt-oss vLLM chart\""), "--out_dir results/ --title \"Model=gpt-oss vLLM chart\""),
formatter_class=argparse.RawDescriptionHelpFormatter) formatter_class=argparse.RawDescriptionHelpFormatter)
# Build help string showing available engine/model combinations # load supported engine_model
engine_model_help = [] engine_model_supported = load_engine_model()
for engine, models in EngineModelData.engine_model.items(): # Get a string representation of supported engine/model combinations
model_list = list(models.keys()) engine_model_supported_str = ', '.join(
engine_model_help.append(f"{engine}:[{','.join(model_list)}]") f"{engine}:[{', '.join(models.keys())}]"
engine_model_str = ' '.join(engine_model_help) for engine, models in engine_model_supported.items())
parser.add_argument( parser.add_argument(
'--in_file', '--in_file',
type=parse_tuple, type=parse_tuple,
...@@ -408,7 +294,7 @@ def main(): ...@@ -408,7 +294,7 @@ def main():
'separated by space. Elapsed_nonprofiled_sec is runtime without ' 'separated by space. Elapsed_nonprofiled_sec is runtime without '
'profiling used to calculate non-gpu time. Specify 0 to use ' 'profiling used to calculate non-gpu time. Specify 0 to use '
'elapsed time from nsys-rep but that might inflate non-gpu time. ' 'elapsed time from nsys-rep but that might inflate non-gpu time. '
f'Available engine:[model] are: {engine_model_str} ' f'Available engine:[model] are: {engine_model_supported_str} '
f'Example: --infile d1.nsys-rep,vllm,llama,100 ' f'Example: --infile d1.nsys-rep,vllm,llama,100 '
'd2.nsys-rep,vllm,gpt-oss,102'), 'd2.nsys-rep,vllm,gpt-oss,102'),
required=True) required=True)
...@@ -418,8 +304,9 @@ def main(): ...@@ -418,8 +304,9 @@ def main():
help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'), help=('nsys cmd, e.g. /usr/bin/nsys, Default: nsys'),
default="nsys") default="nsys")
args = parser.parse_args() args = parser.parse_args()
gputrace = GPUTrace2Graph(args.nsys_cmd) gputrace = GPUTrace2Graph()
gputrace.gen_graph(args.in_file, args.out_dir, args.title) gputrace.gen_graph(args.in_file, args.out_dir, args.title, args.nsys_cmd,
engine_model_supported)
if __name__ == '__main__': if __name__ == '__main__':
......
{
"vllm": {
"llama": {
"fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
"gemm|nvjet": "gemm",
"moe|sigmoid": "moe",
"CatArrayBatched|prepare_inputs": "prepare_next",
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
"_norm_|Norm": "norm",
"act_and_mul_": "activation",
"Rotary": "rope",
"SoftMax": "softmax",
"flash|fmha": "attn",
"elementwise": "elementwise",
"fp8_quant|cvt_": "quantize",
"reduce_kernel": "reduce",
"triton": "triton_kernel",
"CUDA mem": "non-gpu-H_D_memops",
".*": "misc"
},
"ds": {
"block_fp8|gemm_fp8_blockwise": "block_fp8_gemm",
"fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_": "moe_gemm",
"gemm|matmul|nvjet": "gemm",
"moe|sigmoid|expert": "moe",
"CatArrayBatched": "prepare_next",
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
"Norm|_norm_": "norm",
"sbtopk": "topk",
"act_and_mul_": "activation",
"compute_position_kernel": "rope",
"elementwise": "elementwise",
"fp8_quant|quant_fp8|cvt_": "quantize",
"reduce": "reduce",
"SoftMax": "softmax",
"_fwd_|FlashAttn|_mla_|_attn_|fmha": "attn",
"triton": "triton_kernel",
"topk": "topk",
"CUDA mem": "non-gpu-H_D_memops",
".*": "misc"
},
"gpt-oss": {
"block_fp8|gemm_fp8_blockwise": "block_fp8_gemm",
"fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
"gemm|matmul|nvjet": "gemm",
"moe|sigmoid|expert|splitKreduce": "moe",
"CatArrayBatched": "prepare_next",
"ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
"Norm|_norm_": "norm",
"topk": "topk",
"act_and_mul_": "activation",
"compute_position_kernel": "rope",
"elementwise": "elementwise",
"fp8_quant|quant_fp8|cvt_|quantize": "quantize",
"reduce": "reduce",
"SoftMax": "softmax",
"_fwd_|FlashAttn|_mla_|_attn_|_flash_|flash::prepare_varlen|fmha": "attn",
"triton": "triton_kernel",
"CUDA mem": "non-gpu-H_D_memops",
".*": "misc"
}
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment