Commit c721b814 authored by zhuwenwen
Browse files

sync v0.15.1

parent d53fe7e5
......@@ -303,8 +303,8 @@ def chunked_prefill_paged_decode(
num_seqs = len(seq_lens)
num_query_heads = query.shape[1]
# key may be None in cross-attention decode (already cached from encoder)
num_kv_heads = key.shape[1] if key is not None else key_cache.shape[1]
num_queries_per_kv = num_query_heads // num_kv_heads
num_kv_heads = key.shape[1]
num_queries_per_kv = query.shape[1] // key.shape[1]
head_size = query.shape[2]
# Conversion of FP8 Tensor from uint8 storage to
......
......@@ -22,6 +22,7 @@ else:
if current_platform.is_cuda():
try:
import vllm._flashmla_extension_C # noqa: F401
_flashmla_extension_C_AVAILABLE = True
except ImportError:
_flashmla_extension_C_AVAILABLE = False
......
......@@ -8,7 +8,6 @@ import sys
import uuid
import weakref
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from collections.abc import Awaitable, Callable, Sequence
from concurrent.futures import Future
......
......@@ -403,7 +403,7 @@ class SpecDecodeBaseProposer:
return draft_token_ids.view(-1, 1)
if self.uses_mrope:
positions = self.mrope_positions[:, last_token_indices]
positions = self.positions[:, last_token_indices]
else:
positions = self.positions[last_token_indices]
if self.method in (
......@@ -1126,7 +1126,6 @@ class SpecDecodeBaseProposer:
"Qwen2_5_VLForConditionalGeneration",
"Qwen3VLForConditionalGeneration",
"Qwen3VLMoeForConditionalGeneration",
"GlmOcrForConditionalGeneration",
]:
self.model.config.image_token_index = target_model.config.image_token_id
elif self.get_model_name(target_model) == "PixtralForConditionalGeneration":
......
......@@ -74,6 +74,9 @@ class StructuredOutputManager:
self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser_plugin = (
self.vllm_config.structured_outputs_config.reasoning_parser_plugin
)
......
......@@ -11,26 +11,6 @@ from vllm.utils.platform_utils import is_uva_available
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
def async_copy_to_gpu(
x: torch.Tensor | np.ndarray,
out: torch.Tensor | None = None,
device: torch.device | None = None,
) -> torch.Tensor:
if isinstance(x, np.ndarray):
x = torch.from_numpy(x)
assert x.is_cpu
assert not x.is_pinned()
if out is None:
assert device is not None
out = torch.empty_like(x, device=device)
# CPU-to-CPU copy
tmp = x.pin_memory()
# CPU-to-GPU copy
return out.copy_(tmp, non_blocking=True)
class UvaBuffer:
def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
if not is_uva_available():
......@@ -241,4 +221,4 @@ def _apply_write_kernel(
content = tl.load(write_contents_ptr + cu_start + block, mask=mask)
tl.store(
output_ptr + row_idx * output_stride + start_idx + block, content, mask=mask
)
)
\ No newline at end of file
......@@ -6,6 +6,7 @@ import torch
from vllm.model_executor.models.interfaces import SupportsMultiModal
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs
......@@ -31,6 +32,8 @@ class EncoderRunner:
self.req_id_to_mm_features: dict[str, list[MultiModalFeatureSpec]] = {}
self.encoder_cache: dict[str, torch.Tensor] = {}
self.tmp_is_mm_embed = UvaBufferPool(max_num_tokens, torch.bool)
def add_request(self, req_id: str, mm_features: list[MultiModalFeatureSpec]):
self.req_id_to_mm_features[req_id] = mm_features
......@@ -111,7 +114,7 @@ class EncoderRunner:
total_num_scheduled_tokens,
dtype=torch.bool,
device="cpu",
pin_memory=True,
pin_memory=False,
)
for i, req_id in enumerate(req_ids):
if not is_prefilling[i]:
......@@ -160,7 +163,7 @@ class EncoderRunner:
mm_embeds.append(mm_embeds_item)
# Copy the is_mm_embed tensor to the GPU.
is_mm_embed = is_mm_embed.to(device=self.device, non_blocking=True)
is_mm_embed = self.tmp_is_mm_embed.copy_to_gpu(is_mm_embed)
return mm_embeds, is_mm_embed
@torch.inference_mode()
......@@ -178,4 +181,4 @@ class EncoderRunner:
)
# Copy to the pre-allocated buffer for CUDA graphs.
self.inputs_embeds[: x.shape[0]] = x
return self.inputs_embeds
return self.inputs_embeds
\ No newline at end of file
......@@ -30,7 +30,7 @@ from vllm.v1.worker.gpu.attn_utils import (
init_kv_cache,
)
from vllm.v1.worker.gpu.block_table import BlockTables
from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu
from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
from vllm.v1.worker.gpu.dp_utils import (
get_cudagraph_and_dp_padding,
......@@ -172,6 +172,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# LoRA-related workers.
self.lora_state = LoraState(max_num_reqs=self.max_num_reqs)
# Buffers for CPU-to-GPU copies.
self.tmp_idx_mapping = UvaBufferPool(self.max_num_reqs, torch.int32)
self.tmp_cu_num_logits = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
self.tmp_query_start_loc = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
self.kv_connector: KVConnector = NO_OP_KV_CONNECTOR
def update_max_model_len(self, max_model_len: int) -> None:
......@@ -513,7 +518,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.req_states.req_id_to_index[req_id] for req_id in req_ids
]
idx_mapping_np = np.array(idx_mapping_list, dtype=np.int32)
idx_mapping = async_copy_to_gpu(idx_mapping_np, device=self.device)
idx_mapping = self.tmp_idx_mapping.copy_to_gpu(idx_mapping_np)
# Get the number of draft tokens for each request.
if not scheduler_output.scheduled_spec_decode_tokens:
......@@ -541,7 +546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
cu_num_logits_np[0] = 0
np.cumsum(num_logits, out=cu_num_logits_np[1:])
cu_num_logits = async_copy_to_gpu(cu_num_logits_np, device=self.device)
cu_num_logits = self.tmp_cu_num_logits.copy_to_gpu(cu_num_logits_np)
expanded_idx_mapping = expand_idx_mapping(
idx_mapping,
......@@ -560,8 +565,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Pad for full CUDA graph mode.
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
query_start_loc_np[num_reqs + 1 :] = num_tokens
async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc)
self.tmp_query_start_loc.copy_to_gpu(
query_start_loc_np,
out=self.input_buffers.query_start_loc,
)
query_start_loc_np = query_start_loc_np[: num_reqs + 1]
query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
......@@ -969,4 +976,4 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if self.use_async_scheduling:
return async_output
return async_output.get_output()
return async_output.get_output()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment