Commit 341b1b91 authored by guanyu1
Browse files

vl-mrope-1d优化

parent 18459e7a
...@@ -156,6 +156,7 @@ if TYPE_CHECKING: ...@@ -156,6 +156,7 @@ if TYPE_CHECKING:
VLLM_MXFP4_USE_MARLIN: bool | None = None VLLM_MXFP4_USE_MARLIN: bool | None = None
VLLM_DEEPEPLL_NVFP4_DISPATCH: bool = False VLLM_DEEPEPLL_NVFP4_DISPATCH: bool = False
VLLM_V1_USE_OUTLINES_CACHE: bool = False VLLM_V1_USE_OUTLINES_CACHE: bool = False
VLLM_1D_MROPE: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_BUCKET_PADDING_GAP: int = 0
VLLM_TPU_MOST_MODEL_LEN: int | None = None VLLM_TPU_MOST_MODEL_LEN: int | None = None
VLLM_TPU_USING_PATHWAYS: bool = False VLLM_TPU_USING_PATHWAYS: bool = False
...@@ -1888,6 +1889,8 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1888,6 +1889,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_MOE_W16A16_TRITON": "VLLM_USE_MOE_W16A16_TRITON":
lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in
("true", "1")), ("true", "1")),
"VLLM_1D_MROPE":
lambda: (os.environ.get("VLLM_1D_MROPE", "0").lower() in ("true", "1")),
#If set to 1/True, enable the V1 fast token-id copy path in InputBatch. #If set to 1/True, enable the V1 fast token-id copy path in InputBatch.
"VLLM_V1_FAST_TOKEN_ID_COPY": "VLLM_V1_FAST_TOKEN_ID_COPY":
lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in
......
...@@ -789,6 +789,30 @@ class GPUModelRunner( ...@@ -789,6 +789,30 @@ class GPUModelRunner(
pin_memory=self.pin_memory, pin_memory=self.pin_memory,
with_numpy=numpy, with_numpy=numpy,
) )
def _copy_mrope_positions_to_gpu(self, num_tokens: int) -> None:
    """Copy the M-RoPE positions of the first ``num_tokens`` tokens to the GPU.

    Returns immediately when ``self.uses_mrope`` is false. Otherwise the
    matching prefix of ``self.mrope_positions.cpu`` is copied into
    ``self.mrope_positions.gpu`` with ``non_blocking=True``.

    Args:
        num_tokens: Number of token positions to transfer.
    """
    if not self.uses_mrope:
        return

    src = self.mrope_positions.cpu
    dst = self.mrope_positions.gpu
    if self.use_1d_mrope:
        # 1D layout: the leading 3 * num_tokens entries are transferred.
        # _calc_mrope_positions views this buffer as (max_num_tokens + 1, 3),
        # i.e. three position values per token stored next to each other.
        num_values = 3 * num_tokens
        dst[:num_values].copy_(src[:num_values], non_blocking=True)
    else:
        # Default layout: tokens are indexed along the second axis.
        dst[:, :num_tokens].copy_(src[:, :num_tokens], non_blocking=True)
def _copy_xdrope_positions_to_gpu(self, num_tokens: int) -> None:
    """Copy the XD-RoPE positions of the first ``num_tokens`` tokens to the GPU.

    Returns immediately when ``self.uses_xdrope_dim`` is not positive.
    Otherwise the first ``num_tokens`` columns of
    ``self.xdrope_positions.cpu`` are copied into
    ``self.xdrope_positions.gpu`` with ``non_blocking=True``.

    Args:
        num_tokens: Number of token positions to transfer.
    """
    if self.uses_xdrope_dim <= 0:
        return

    positions = self.xdrope_positions
    positions.gpu[:, :num_tokens].copy_(
        positions.cpu[:, :num_tokens],
        non_blocking=True,
    )
def _init_model_kwargs(self): def _init_model_kwargs(self):
model_kwargs = dict[str, Any]() model_kwargs = dict[str, Any]()
...@@ -1592,19 +1616,12 @@ class GPUModelRunner( ...@@ -1592,19 +1616,12 @@ class GPUModelRunner(
total_num_scheduled_tokens, total_num_scheduled_tokens,
cu_num_tokens, cu_num_tokens,
) )
if self.uses_mrope: if self.uses_mrope:
# Only relevant for models using M-RoPE (e.g, Qwen2-VL) # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_( self._copy_mrope_positions_to_gpu(total_num_scheduled_tokens)
self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
non_blocking=True,
)
elif self.uses_xdrope_dim > 0: elif self.uses_xdrope_dim > 0:
# Only relevant for models using XD-RoPE (e.g, HunYuan-VL) # Only relevant for models using XD-RoPE (e.g, HunYuan-VL)
self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_( self._copy_xdrope_positions_to_gpu(total_num_scheduled_tokens)
self.xdrope_positions.cpu[:, :total_num_scheduled_tokens],
non_blocking=True,
)
else: else:
# Common case (1D positions) # Common case (1D positions)
self.positions.copy_to_gpu(total_num_scheduled_tokens) self.positions.copy_to_gpu(total_num_scheduled_tokens)
...@@ -2047,6 +2064,13 @@ class GPUModelRunner( ...@@ -2047,6 +2064,13 @@ class GPUModelRunner(
def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
mrope_pos_ptr = 0 mrope_pos_ptr = 0
if self.use_1d_mrope:
mrope_positions_token_major = self.mrope_positions.cpu.view(
self.max_num_tokens + 1, 3
)
mrope_positions_token_major_np = self.mrope_positions.np.reshape(
self.max_num_tokens + 1, 3
)
for index, req_id in enumerate(self.input_batch.req_ids): for index, req_id in enumerate(self.input_batch.req_ids):
req = self.requests[req_id] req = self.requests[req_id]
assert req.mrope_positions is not None assert req.mrope_positions is not None
...@@ -2073,6 +2097,11 @@ class GPUModelRunner( ...@@ -2073,6 +2097,11 @@ class GPUModelRunner(
src_start = num_computed_tokens src_start = num_computed_tokens
src_end = num_computed_tokens + prompt_part_len src_end = num_computed_tokens + prompt_part_len
if self.use_1d_mrope:
mrope_positions_token_major[dst_start:dst_end, :].copy_(
req.mrope_positions[:, src_start:src_end].transpose(0, 1)
)
else:
self.mrope_positions.cpu[:, dst_start:dst_end] = req.mrope_positions[ self.mrope_positions.cpu[:, dst_start:dst_end] = req.mrope_positions[
:, src_start:src_end :, src_start:src_end
] ]
...@@ -2084,6 +2113,19 @@ class GPUModelRunner( ...@@ -2084,6 +2113,19 @@ class GPUModelRunner(
dst_end = mrope_pos_ptr + completion_part_len dst_end = mrope_pos_ptr + completion_part_len
assert req.mrope_position_delta is not None assert req.mrope_position_delta is not None
if self.use_1d_mrope:
values = np.arange(
req.mrope_position_delta + num_computed_tokens + prompt_part_len,
req.mrope_position_delta
+ num_computed_tokens
+ prompt_part_len
+ completion_part_len,
dtype=mrope_positions_token_major_np.dtype,
)
mrope_positions_token_major_np[dst_start:dst_end, :] = values[
:, None
]
else:
MRotaryEmbedding.get_next_input_positions_tensor( MRotaryEmbedding.get_next_input_positions_tensor(
out=self.mrope_positions.np, out=self.mrope_positions.np,
out_offset=dst_start, out_offset=dst_start,
...@@ -2574,11 +2616,11 @@ class GPUModelRunner( ...@@ -2574,11 +2616,11 @@ class GPUModelRunner(
if should_sync_mrope_positions: if should_sync_mrope_positions:
self._calc_mrope_positions(scheduler_output) self._calc_mrope_positions(scheduler_output)
self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens) self._copy_mrope_positions_to_gpu(total_num_scheduled_tokens)
if should_sync_xdrope_positions: if should_sync_xdrope_positions:
self._calc_xdrope_positions(scheduler_output) self._calc_xdrope_positions(scheduler_output)
self.xdrope_positions.copy_to_gpu(total_num_scheduled_tokens) self._copy_xdrope_positions_to_gpu(total_num_scheduled_tokens)
return mm_embeds, is_mm_embed return mm_embeds, is_mm_embed
...@@ -2837,12 +2879,7 @@ class GPUModelRunner( ...@@ -2837,12 +2879,7 @@ class GPUModelRunner(
inputs_embeds = None inputs_embeds = None
model_kwargs = self._init_model_kwargs() model_kwargs = self._init_model_kwargs()
if self.uses_mrope: positions = self._get_positions(num_input_tokens)
positions = self.mrope_positions.gpu[:, :num_input_tokens]
elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_input_tokens]
else:
positions = self.positions.gpu[:num_input_tokens]
if is_first_rank: if is_first_rank:
intermediate_tensors = None intermediate_tensors = None
...@@ -4727,13 +4764,13 @@ class GPUModelRunner( ...@@ -4727,13 +4764,13 @@ class GPUModelRunner(
input_ids = self.input_ids.gpu[:num_tokens_padded] input_ids = self.input_ids.gpu[:num_tokens_padded]
inputs_embeds = None inputs_embeds = None
if self.uses_mrope: # if self.uses_mrope:
positions = self.mrope_positions.gpu[:, :num_tokens_padded] # positions = self.mrope_positions.gpu[:, :num_tokens_padded]
elif self.uses_xdrope_dim > 0: # elif self.uses_xdrope_dim > 0:
positions = self.xdrope_positions.gpu[:, :num_tokens_padded] # positions = self.xdrope_positions.gpu[:, :num_tokens_padded]
else: # else:
positions = self.positions.gpu[:num_tokens_padded] # positions = self.positions.gpu[:num_tokens_padded]
positions = self._get_positions(num_tokens_padded)
if get_pp_group().is_first_rank: if get_pp_group().is_first_rank:
intermediate_tensors = None intermediate_tensors = None
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment