"vscode:/vscode.git/clone" did not exist on "09e9245478a44faec3c9bc888edea4089085e222"
Commit 0a9b4a7f authored by zhuwenwen
Browse files

Merge branch 'v0.9.1-dev-wm' into 'v0.9.1-dev'

[fix]1.优化mtp代码,解决prefix-caching不兼容问题; 2.修复v1 engine从tokenizer config中读取max_model_len导致长文本输入报错

See merge request dcutoolkit/deeplearing/vllm!158
parents e34d3444 b16f183c
...@@ -1440,17 +1440,17 @@ class ModelConfig: ...@@ -1440,17 +1440,17 @@ class ModelConfig:
spec_target_max_model_len=self.spec_target_max_model_len, spec_target_max_model_len=self.spec_target_max_model_len,
encoder_config=self.encoder_config) encoder_config=self.encoder_config)
tokenizer_config = try_get_tokenizer_config( # tokenizer_config = try_get_tokenizer_config(
self.tokenizer, # self.tokenizer,
trust_remote_code=self.trust_remote_code, # trust_remote_code=self.trust_remote_code,
revision=self.tokenizer_revision) # revision=self.tokenizer_revision)
if tokenizer_config is None: # if tokenizer_config is None:
return max_model_len # return max_model_len
model_max_length = tokenizer_config.get("model_max_length", # model_max_length = tokenizer_config.get("model_max_length",
max_model_len) # max_model_len)
max_model_len = min(max_model_len, model_max_length) #max_model_len = min(max_model_len, model_max_length)
return max_model_len return max_model_len
......
...@@ -388,30 +388,8 @@ class MLACommonMetadataBuilder(Generic[M]): ...@@ -388,30 +388,8 @@ class MLACommonMetadataBuilder(Generic[M]):
) )
self.block_table = block_table self.block_table = block_table
self._use_spec_decode = False self.use_spec_decode = False
self.pin_memory = is_pin_memory_available() self.num_scheduled_tokens_np = np.zeros(scheduler_config.max_num_seqs, dtype=np.int32)
self._num_scheduled_tokens = torch.zeros(scheduler_config.max_num_seqs,
dtype=torch.int32,
device=runner.device)
self._num_scheduled_tokens_cpu_tensor = torch.zeros(
(scheduler_config.max_num_seqs, ),
device="cpu",
dtype=torch.int32,
pin_memory=self.pin_memory,
)
self._num_scheduled_tokens_np = self._num_scheduled_tokens_cpu_tensor.numpy()
self._seq_lens_minus = torch.zeros(scheduler_config.max_num_seqs*5,
dtype=torch.int32,
device=runner.device)
self._seq_lens_minus_cpu_tensor = torch.zeros(
(scheduler_config.max_num_seqs*5, ),
device="cpu",
dtype=torch.int32,
pin_memory=self.pin_memory,
)
self._seq_lens_minus_np = self._seq_lens_minus_cpu_tensor.numpy()
def reorder_batch(self, input_batch: "InputBatch", def reorder_batch(self, input_batch: "InputBatch",
...@@ -445,7 +423,7 @@ class MLACommonMetadataBuilder(Generic[M]): ...@@ -445,7 +423,7 @@ class MLACommonMetadataBuilder(Generic[M]):
req_idx = input_batch.req_id_to_index[req_id] req_idx = input_batch.req_id_to_index[req_id]
num_computed_tokens = input_batch.num_computed_tokens_cpu[req_idx] num_computed_tokens = input_batch.num_computed_tokens_cpu[req_idx]
num_prompt_tokens = input_batch.num_prompt_tokens[req_idx] num_prompt_tokens = input_batch.num_prompt_tokens[req_idx]
self._num_scheduled_tokens_np[i] = num_tokens self.num_scheduled_tokens_np[i] = num_tokens
if num_computed_tokens < num_prompt_tokens: if num_computed_tokens < num_prompt_tokens:
prefills.append(i) prefills.append(i)
num_prefill_tokens += num_tokens num_prefill_tokens += num_tokens
...@@ -478,9 +456,10 @@ class MLACommonMetadataBuilder(Generic[M]): ...@@ -478,9 +456,10 @@ class MLACommonMetadataBuilder(Generic[M]):
modified_batch = True modified_batch = True
# num_scheduled_tokens also need to be swapped # num_scheduled_tokens also need to be swapped
tmp = self._num_scheduled_tokens_np[decode_idx] tmp = self.num_scheduled_tokens_np[decode_idx]
self._num_scheduled_tokens_np[decode_idx] = self._num_scheduled_tokens_np[prefills[i - 1]] self.num_scheduled_tokens_np[decode_idx] = self.num_scheduled_tokens_np[prefills[i - 1]]
self._num_scheduled_tokens_np[prefills[i - 1]] = tmp self.num_scheduled_tokens_np[prefills[i - 1]] = tmp
# Save for next `build` call # Save for next `build` call
# TODO(lucas): this is a bit of a hack, we should probably have a # TODO(lucas): this is a bit of a hack, we should probably have a
...@@ -490,11 +469,7 @@ class MLACommonMetadataBuilder(Generic[M]): ...@@ -490,11 +469,7 @@ class MLACommonMetadataBuilder(Generic[M]):
self._num_decode_tokens = num_decode_tokens self._num_decode_tokens = num_decode_tokens
self._num_prefill_tokens = num_prefill_tokens self._num_prefill_tokens = num_prefill_tokens
self._use_spec_decode = use_spec_decode self.use_spec_decode = use_spec_decode
if use_spec_decode:
self._num_scheduled_tokens[:len(input_batch.req_ids)].copy_(
self._num_scheduled_tokens_cpu_tensor[:len(input_batch.req_ids)],
non_blocking=True)
return modified_batch return modified_batch
...@@ -601,31 +576,27 @@ class MLACommonMetadataBuilder(Generic[M]): ...@@ -601,31 +576,27 @@ class MLACommonMetadataBuilder(Generic[M]):
decode_metadata = None decode_metadata = None
if self._num_decodes > 0: if self._num_decodes > 0:
if self._use_spec_decode: if self.use_spec_decode:
# generate block_table/seq_lens of mla in spec decoding scenarios query_lens = self.num_scheduled_tokens_np[:self._num_decodes]
if common_attn_metadata.num_rejected_tokens_tuple is None: if common_attn_metadata.num_rejected_tokens is not None:
repeats = self._num_scheduled_tokens[:self._num_decodes] num_rejected_tokens = common_attn_metadata.num_rejected_tokens[:self._num_decodes]
repeats_cpu = self._num_scheduled_tokens_np[:self._num_decodes] query_lens = query_lens - np.array(num_rejected_tokens, dtype=np.int32)
else:
repeats = self._num_scheduled_tokens[:self._num_decodes] - \
common_attn_metadata.num_rejected_tokens_tuple[1][:self._num_decodes]
num_rejected_tokens = common_attn_metadata.num_rejected_tokens_tuple[0][:self._num_decodes]
repeats_cpu = self._num_scheduled_tokens_np[:self._num_decodes] - \
np.array(num_rejected_tokens)
self._num_decode_tokens -= sum(num_rejected_tokens) self._num_decode_tokens -= sum(num_rejected_tokens)
cu_num_blocks = np.cumsum(query_lens)
virtual_batches = cu_num_blocks[-1]
block_offsets = np.repeat(cu_num_blocks - query_lens, query_lens)
arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
rarange = np.repeat(query_lens, query_lens) - arange - 1
repeats = torch.from_numpy(query_lens).pin_memory().to(
block_table_tensor.device, non_blocking=True)
decode_block_table_tensor = torch.repeat_interleave( decode_block_table_tensor = torch.repeat_interleave(
block_table_tensor[:self._num_decodes, ...], block_table_tensor[:self._num_decodes, ...],
repeats, dim=0) repeats, dim=0)
total_decode_tokens = np.sum(repeats_cpu)
decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0) decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0)
self._seq_lens_minus_np[:total_decode_tokens] = np.fromiter( seq_lens_minus = torch.from_numpy(rarange).to(torch.int32).pin_memory().to(
chain.from_iterable(np.flip(np.arange(x)) for x in repeats_cpu), seq_lens.device, non_blocking=True)
dtype=int) decode_seq_lens = decode_seq_lens - seq_lens_minus
self._seq_lens_minus[:total_decode_tokens].copy_(self._seq_lens_minus_cpu_tensor[:total_decode_tokens],
non_blocking=True)
decode_seq_lens = decode_seq_lens - self._seq_lens_minus[:total_decode_tokens]
decode_metadata = self._build_decode( decode_metadata = self._build_decode(
block_table_tensor=decode_block_table_tensor, block_table_tensor=decode_block_table_tensor,
......
...@@ -17,7 +17,7 @@ class CommonAttentionMetadata: ...@@ -17,7 +17,7 @@ class CommonAttentionMetadata:
seq_lens: torch.Tensor seq_lens: torch.Tensor
"""(batch_size,), the length of each request including both computed tokens """(batch_size,), the length of each request including both computed tokens
and newly scheduled tokens""" and newly scheduled tokens"""
num_rejected_tokens_tuple: tuple[list[int], torch.Tensor] = None num_rejected_tokens: list[int] = None
"""(batch_size,), record the rejected tokens number in cpu and gpu""" """(batch_size,), record the rejected tokens number in cpu and gpu"""
def validate_kv_sharing_target(current_layer_name, target_layer_name, def validate_kv_sharing_target(current_layer_name, target_layer_name,
......
...@@ -93,7 +93,8 @@ class EagleProposer: ...@@ -93,7 +93,8 @@ class EagleProposer:
# [batch_size, max_num_blocks_per_req] # [batch_size, max_num_blocks_per_req]
block_table: torch.Tensor, block_table: torch.Tensor,
# [batch_size] # [batch_size]
num_rejected_tokens_tuple: tuple[list[int], torch.Tensor], num_rejected_tokens: list[int],
# [batch_size]
sampling_metadata: SamplingMetadata sampling_metadata: SamplingMetadata
) -> torch.Tensor: ) -> torch.Tensor:
num_tokens = target_token_ids.shape[0] num_tokens = target_token_ids.shape[0]
...@@ -143,7 +144,7 @@ class EagleProposer: ...@@ -143,7 +144,7 @@ class EagleProposer:
common_attn_metadata = CommonAttentionMetadata( common_attn_metadata = CommonAttentionMetadata(
query_start_loc=cu_num_tokens, query_start_loc=cu_num_tokens,
seq_lens=seq_lens, seq_lens=seq_lens,
num_rejected_tokens_tuple=num_rejected_tokens_tuple) num_rejected_tokens=num_rejected_tokens)
assert self.runner is not None assert self.runner is not None
......
...@@ -1441,7 +1441,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1441,7 +1441,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
else: else:
block_table = None block_table = None
num_rejected_tokens_tuple = None num_rejected_tokens = None
if spec_decode_metadata is None: if spec_decode_metadata is None:
# input_ids can be None for multimodal models. # input_ids can be None for multimodal models.
target_token_ids = self.input_ids[:num_scheduled_tokens] target_token_ids = self.input_ids[:num_scheduled_tokens]
...@@ -1481,7 +1481,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1481,7 +1481,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
target_hidden_states = hidden_states[token_indices] target_hidden_states = hidden_states[token_indices]
target_slot_mapping = eagle_attn_metadata.slot_mapping[ target_slot_mapping = eagle_attn_metadata.slot_mapping[
token_indices] token_indices]
num_rejected_tokens_tuple = (num_rejected_tokens, num_rejected_tokens_tensor)
draft_token_ids = self.drafter.propose( draft_token_ids = self.drafter.propose(
target_token_ids=target_token_ids, target_token_ids=target_token_ids,
target_positions=target_positions, target_positions=target_positions,
...@@ -1491,7 +1490,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1491,7 +1490,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_tokens=cu_num_tokens, cu_num_tokens=cu_num_tokens,
block_table=block_table, block_table=block_table,
sampling_metadata=sampling_metadata, sampling_metadata=sampling_metadata,
num_rejected_tokens_tuple=num_rejected_tokens_tuple num_rejected_tokens=num_rejected_tokens
) )
spec_token_ids = draft_token_ids.tolist() spec_token_ids = draft_token_ids.tolist()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment