Commit c721b814 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1

parent d53fe7e5
...@@ -303,8 +303,8 @@ def chunked_prefill_paged_decode( ...@@ -303,8 +303,8 @@ def chunked_prefill_paged_decode(
num_seqs = len(seq_lens) num_seqs = len(seq_lens)
num_query_heads = query.shape[1] num_query_heads = query.shape[1]
# key may be None in cross-attention decode (already cached from encoder) # key may be None in cross-attention decode (already cached from encoder)
num_kv_heads = key.shape[1] if key is not None else key_cache.shape[1] num_kv_heads = key.shape[1]
num_queries_per_kv = num_query_heads // num_kv_heads num_queries_per_kv = query.shape[1] // key.shape[1]
head_size = query.shape[2] head_size = query.shape[2]
# Conversion of FP8 Tensor from uint8 storage to # Conversion of FP8 Tensor from uint8 storage to
......
...@@ -22,6 +22,7 @@ else: ...@@ -22,6 +22,7 @@ else:
if current_platform.is_cuda(): if current_platform.is_cuda():
try: try:
import vllm._flashmla_extension_C # noqa: F401 import vllm._flashmla_extension_C # noqa: F401
_flashmla_extension_C_AVAILABLE = True _flashmla_extension_C_AVAILABLE = True
except ImportError: except ImportError:
_flashmla_extension_C_AVAILABLE = False _flashmla_extension_C_AVAILABLE = False
......
...@@ -8,7 +8,6 @@ import sys ...@@ -8,7 +8,6 @@ import sys
import uuid import uuid
import weakref import weakref
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections import defaultdict, deque from collections import defaultdict, deque
from collections.abc import Awaitable, Callable, Sequence from collections.abc import Awaitable, Callable, Sequence
from concurrent.futures import Future from concurrent.futures import Future
......
...@@ -403,7 +403,7 @@ class SpecDecodeBaseProposer: ...@@ -403,7 +403,7 @@ class SpecDecodeBaseProposer:
return draft_token_ids.view(-1, 1) return draft_token_ids.view(-1, 1)
if self.uses_mrope: if self.uses_mrope:
positions = self.mrope_positions[:, last_token_indices] positions = self.positions[:, last_token_indices]
else: else:
positions = self.positions[last_token_indices] positions = self.positions[last_token_indices]
if self.method in ( if self.method in (
...@@ -1126,7 +1126,6 @@ class SpecDecodeBaseProposer: ...@@ -1126,7 +1126,6 @@ class SpecDecodeBaseProposer:
"Qwen2_5_VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration",
"Qwen3VLForConditionalGeneration", "Qwen3VLForConditionalGeneration",
"Qwen3VLMoeForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration",
"GlmOcrForConditionalGeneration",
]: ]:
self.model.config.image_token_index = target_model.config.image_token_id self.model.config.image_token_index = target_model.config.image_token_id
elif self.get_model_name(target_model) == "PixtralForConditionalGeneration": elif self.get_model_name(target_model) == "PixtralForConditionalGeneration":
......
...@@ -74,6 +74,9 @@ class StructuredOutputManager: ...@@ -74,6 +74,9 @@ class StructuredOutputManager:
self.tokenizer = cached_tokenizer_from_config( self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config model_config=self.vllm_config.model_config
) )
reasoning_parser = (
self.vllm_config.structured_outputs_config.reasoning_parser
)
reasoning_parser_plugin = ( reasoning_parser_plugin = (
self.vllm_config.structured_outputs_config.reasoning_parser_plugin self.vllm_config.structured_outputs_config.reasoning_parser_plugin
) )
......
...@@ -11,26 +11,6 @@ from vllm.utils.platform_utils import is_uva_available ...@@ -11,26 +11,6 @@ from vllm.utils.platform_utils import is_uva_available
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
def async_copy_to_gpu(
x: torch.Tensor | np.ndarray,
out: torch.Tensor | None = None,
device: torch.device | None = None,
) -> torch.Tensor:
if isinstance(x, np.ndarray):
x = torch.from_numpy(x)
assert x.is_cpu
assert not x.is_pinned()
if out is None:
assert device is not None
out = torch.empty_like(x, device=device)
# CPU-to-CPU copy
tmp = x.pin_memory()
# CPU-to-GPU copy
return out.copy_(tmp, non_blocking=True)
class UvaBuffer: class UvaBuffer:
def __init__(self, size: int | Sequence[int], dtype: torch.dtype): def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
if not is_uva_available(): if not is_uva_available():
...@@ -241,4 +221,4 @@ def _apply_write_kernel( ...@@ -241,4 +221,4 @@ def _apply_write_kernel(
content = tl.load(write_contents_ptr + cu_start + block, mask=mask) content = tl.load(write_contents_ptr + cu_start + block, mask=mask)
tl.store( tl.store(
output_ptr + row_idx * output_stride + start_idx + block, content, mask=mask output_ptr + row_idx * output_stride + start_idx + block, content, mask=mask
) )
\ No newline at end of file
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem
from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs
...@@ -31,6 +32,8 @@ class EncoderRunner: ...@@ -31,6 +32,8 @@ class EncoderRunner:
self.req_id_to_mm_features: dict[str, list[MultiModalFeatureSpec]] = {} self.req_id_to_mm_features: dict[str, list[MultiModalFeatureSpec]] = {}
self.encoder_cache: dict[str, torch.Tensor] = {} self.encoder_cache: dict[str, torch.Tensor] = {}
self.tmp_is_mm_embed = UvaBufferPool(max_num_tokens, torch.bool)
def add_request(self, req_id: str, mm_features: list[MultiModalFeatureSpec]): def add_request(self, req_id: str, mm_features: list[MultiModalFeatureSpec]):
self.req_id_to_mm_features[req_id] = mm_features self.req_id_to_mm_features[req_id] = mm_features
...@@ -111,7 +114,7 @@ class EncoderRunner: ...@@ -111,7 +114,7 @@ class EncoderRunner:
total_num_scheduled_tokens, total_num_scheduled_tokens,
dtype=torch.bool, dtype=torch.bool,
device="cpu", device="cpu",
pin_memory=True, pin_memory=False,
) )
for i, req_id in enumerate(req_ids): for i, req_id in enumerate(req_ids):
if not is_prefilling[i]: if not is_prefilling[i]:
...@@ -160,7 +163,7 @@ class EncoderRunner: ...@@ -160,7 +163,7 @@ class EncoderRunner:
mm_embeds.append(mm_embeds_item) mm_embeds.append(mm_embeds_item)
# Copy the is_mm_embed tensor to the GPU. # Copy the is_mm_embed tensor to the GPU.
is_mm_embed = is_mm_embed.to(device=self.device, non_blocking=True) is_mm_embed = self.tmp_is_mm_embed.copy_to_gpu(is_mm_embed)
return mm_embeds, is_mm_embed return mm_embeds, is_mm_embed
@torch.inference_mode() @torch.inference_mode()
...@@ -178,4 +181,4 @@ class EncoderRunner: ...@@ -178,4 +181,4 @@ class EncoderRunner:
) )
# Copy to the pre-allocated buffer for CUDA graphs. # Copy to the pre-allocated buffer for CUDA graphs.
self.inputs_embeds[: x.shape[0]] = x self.inputs_embeds[: x.shape[0]] = x
return self.inputs_embeds return self.inputs_embeds
\ No newline at end of file
...@@ -30,7 +30,7 @@ from vllm.v1.worker.gpu.attn_utils import ( ...@@ -30,7 +30,7 @@ from vllm.v1.worker.gpu.attn_utils import (
init_kv_cache, init_kv_cache,
) )
from vllm.v1.worker.gpu.block_table import BlockTables from vllm.v1.worker.gpu.block_table import BlockTables
from vllm.v1.worker.gpu.buffer_utils import async_copy_to_gpu from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool
from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
from vllm.v1.worker.gpu.dp_utils import ( from vllm.v1.worker.gpu.dp_utils import (
get_cudagraph_and_dp_padding, get_cudagraph_and_dp_padding,
...@@ -172,6 +172,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -172,6 +172,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# LoRA-related workers. # LoRA-related workers.
self.lora_state = LoraState(max_num_reqs=self.max_num_reqs) self.lora_state = LoraState(max_num_reqs=self.max_num_reqs)
# Buffers for CPU-to-GPU copies.
self.tmp_idx_mapping = UvaBufferPool(self.max_num_reqs, torch.int32)
self.tmp_cu_num_logits = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
self.tmp_query_start_loc = UvaBufferPool(self.max_num_reqs + 1, torch.int32)
self.kv_connector: KVConnector = NO_OP_KV_CONNECTOR self.kv_connector: KVConnector = NO_OP_KV_CONNECTOR
def update_max_model_len(self, max_model_len: int) -> None: def update_max_model_len(self, max_model_len: int) -> None:
...@@ -513,7 +518,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -513,7 +518,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.req_states.req_id_to_index[req_id] for req_id in req_ids self.req_states.req_id_to_index[req_id] for req_id in req_ids
] ]
idx_mapping_np = np.array(idx_mapping_list, dtype=np.int32) idx_mapping_np = np.array(idx_mapping_list, dtype=np.int32)
idx_mapping = async_copy_to_gpu(idx_mapping_np, device=self.device) idx_mapping = self.tmp_idx_mapping.copy_to_gpu(idx_mapping_np)
# Get the number of draft tokens for each request. # Get the number of draft tokens for each request.
if not scheduler_output.scheduled_spec_decode_tokens: if not scheduler_output.scheduled_spec_decode_tokens:
...@@ -541,7 +546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -541,7 +546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32) cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
cu_num_logits_np[0] = 0 cu_num_logits_np[0] = 0
np.cumsum(num_logits, out=cu_num_logits_np[1:]) np.cumsum(num_logits, out=cu_num_logits_np[1:])
cu_num_logits = async_copy_to_gpu(cu_num_logits_np, device=self.device) cu_num_logits = self.tmp_cu_num_logits.copy_to_gpu(cu_num_logits_np)
expanded_idx_mapping = expand_idx_mapping( expanded_idx_mapping = expand_idx_mapping(
idx_mapping, idx_mapping,
...@@ -560,8 +565,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -560,8 +565,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Pad for full CUDA graph mode. # Pad for full CUDA graph mode.
# Some attention backends like FA3 require query_start_loc to be non-decreasing. # Some attention backends like FA3 require query_start_loc to be non-decreasing.
query_start_loc_np[num_reqs + 1 :] = num_tokens query_start_loc_np[num_reqs + 1 :] = num_tokens
async_copy_to_gpu(query_start_loc_np, out=self.input_buffers.query_start_loc) self.tmp_query_start_loc.copy_to_gpu(
query_start_loc_np,
out=self.input_buffers.query_start_loc,
)
query_start_loc_np = query_start_loc_np[: num_reqs + 1] query_start_loc_np = query_start_loc_np[: num_reqs + 1]
query_start_loc_cpu = torch.from_numpy(query_start_loc_np) query_start_loc_cpu = torch.from_numpy(query_start_loc_np)
query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
...@@ -969,4 +976,4 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -969,4 +976,4 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if self.use_async_scheduling: if self.use_async_scheduling:
return async_output return async_output
return async_output.get_output() return async_output.get_output()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment