"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "f75aa727320f0edabe5d31410ef86ac8ae27d86b"
Unverified commit cec418b5, authored by Woosuk Kwon, committed by GitHub
Browse files

[Model Runner V2] Change Numba AoT to JIT (#29328)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent cc313cb7
...@@ -4,7 +4,6 @@ from dataclasses import dataclass ...@@ -4,7 +4,6 @@ from dataclasses import dataclass
from typing import Any from typing import Any
import numba import numba
import numba.types as types
import numpy as np import numpy as np
import torch import torch
...@@ -147,80 +146,42 @@ class InputBatch: ...@@ -147,80 +146,42 @@ class InputBatch:
) )
# NOTE: With the type annotations, this function is pre-compiled @numba.njit(cache=True)
# before the first call.
@numba.jit(
[
types.none(
types.int32[:], # idx_mapping
types.int32[:], # num_scheduled_tokens
types.int32[:, :], # prefill_token_ids
types.int32[:], # num_computed_prefill_tokens
types.int32[:], # prefill_len
types.int32[:], # input_ids
types.int32[:], # query_start_loc
)
],
nopython=True,
cache=True,
)
def _prepare_prefill_inputs( def _prepare_prefill_inputs(
idx_mapping: np.ndarray, # batch_idx -> req_idx idx_mapping: np.ndarray, # [B]
num_scheduled_tokens: np.ndarray, # [B] query_lens: np.ndarray, # [B]
query_start_loc: np.ndarray, # [B + 1]
prefill_token_ids: np.ndarray, # [N, max_model_len] prefill_token_ids: np.ndarray, # [N, max_model_len]
num_computed_prefill_tokens: np.ndarray, # [N] num_computed_prefill_tokens: np.ndarray, # [N]
prefill_len: np.ndarray, # [N]
input_ids: np.ndarray, # [num_input_tokens] input_ids: np.ndarray, # [num_input_tokens]
query_start_loc: np.ndarray, # [B + 1]
) -> None: ) -> None:
num_reqs = num_scheduled_tokens.shape[0] num_reqs = idx_mapping.shape[0]
query_start_loc[0] = 0 query_starts = query_start_loc[:num_reqs]
query_ends = query_start_loc[1 : num_reqs + 1]
cu_num_tokens = 0 starts = num_computed_prefill_tokens[idx_mapping]
ends = starts + query_lens
for i in range(num_reqs): for i in range(num_reqs):
req_idx = idx_mapping[i] input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[
query_len = num_scheduled_tokens[i] idx_mapping[i], starts[i] : ends[i]
]
start = num_computed_prefill_tokens[req_idx]
end = min(start + query_len, prefill_len[req_idx])
n = end - start
start_idx = cu_num_tokens
input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end]
cu_num_tokens = start_idx + query_len
query_start_loc[i + 1] = cu_num_tokens
# Pad the inputs for CUDA graphs.
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
def prepare_prefill_inputs( def prepare_prefill_inputs(
idx_mapping: np.ndarray, idx_mapping: np.ndarray,
num_scheduled_tokens: np.ndarray, num_scheduled_tokens: np.ndarray,
total_num_tokens: int, query_start_loc: np.ndarray,
prefill_token_ids: np.ndarray, prefill_token_ids: np.ndarray,
num_computed_prefill_tokens: np.ndarray, num_computed_prefill_tokens: np.ndarray,
prefill_len: np.ndarray, input_ids: np.ndarray,
input_ids: CpuGpuBuffer,
query_start_loc: CpuGpuBuffer,
) -> None: ) -> None:
_prepare_prefill_inputs( _prepare_prefill_inputs(
idx_mapping, idx_mapping,
num_scheduled_tokens, num_scheduled_tokens,
query_start_loc,
prefill_token_ids, prefill_token_ids,
num_computed_prefill_tokens, num_computed_prefill_tokens,
prefill_len, input_ids,
input_ids.np,
query_start_loc.np,
) )
input_ids.copy_to_gpu(total_num_tokens)
# NOTE(woosuk): We should copy the whole query_start_loc and seq_lens
# tensors from CPU to GPU, because they may include paddings needed
# for full CUDA graph mode.
query_start_loc.copy_to_gpu()
@triton.jit @triton.jit
......
...@@ -502,20 +502,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -502,20 +502,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks] # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
block_tables = self.block_tables.gather_block_tables(idx_mapping) block_tables = self.block_tables.gather_block_tables(idx_mapping)
# Copy prefill tokens from CPU to GPU and get query_start_loc. # Get query_start_loc.
np.cumsum(
num_scheduled_tokens,
out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
)
# Pad for full CUDA graph mode.
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
self.input_buffers.query_start_loc.copy_to_gpu()
query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
# Copy prefill tokens from CPU to GPU.
prepare_prefill_inputs( prepare_prefill_inputs(
idx_mapping_np, idx_mapping_np,
num_scheduled_tokens, num_scheduled_tokens,
num_tokens, query_start_loc_np,
self.req_states.prefill_token_ids, self.req_states.prefill_token_ids,
self.req_states.num_computed_prefill_tokens, self.req_states.num_computed_prefill_tokens,
self.req_states.prefill_len.np, self.input_buffers.input_ids.np,
self.input_buffers.input_ids,
self.input_buffers.query_start_loc,
) )
query_start_loc = self.input_buffers.query_start_loc self.input_buffers.input_ids.copy_to_gpu(num_tokens)
query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
query_start_loc_np = query_start_loc.np[: num_reqs + 1]
# Prepare positions and seq_lens. # Prepare positions and seq_lens.
prepare_pos_seq_lens( prepare_pos_seq_lens(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment