Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
...@@ -733,12 +733,13 @@ def _pythonize_sampler_output( ...@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
logprobs_tensor: Optional[torch.Tensor], logprobs_tensor: Optional[torch.Tensor],
cache: Optional[PythonizationCache], cache: Optional[PythonizationCache],
) -> None: ) -> None:
""" This function is only called when the output tensors are ready. """ This function is only called when the output tensors are ready.
See {class}`ModelOutput`. See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput].
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure adding a Pythonized output data structure
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`. ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
for each [`SequenceGroup`][vllm.sequence.SequenceGroup].
Args: Args:
model_input model_input
...@@ -824,7 +825,7 @@ def _pythonize_sampler_output( ...@@ -824,7 +825,7 @@ def _pythonize_sampler_output(
for sgdx, (seq_group, for sgdx, (seq_group,
sample_result) in enumerate(zip(seq_groups, samples_list)): sample_result) in enumerate(zip(seq_groups, samples_list)):
# Reminder: Please update docs/source/features/compatibility_matrix.md # Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
# (Check for Guided Decoding) # (Check for Guided Decoding)
if seq_group.sampling_params.logits_processors: if seq_group.sampling_params.logits_processors:
......
...@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner): ...@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
input_ids=model_input.input_tokens, input_ids=model_input.input_tokens,
positions=model_input.input_positions, positions=model_input.input_positions,
input_block_ids=model_input.input_block_ids, input_block_ids=model_input.input_block_ids,
**MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, **MultiModalKwargs.as_kwargs(
device=self.device), model_input.multi_modal_kwargs or {},
dtype=self.model_config.dtype,
device=self.device,
),
) )
output = self.model.sample( output = self.model.sample(
......
...@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): ...@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
positions=model_input.input_positions, positions=model_input.input_positions,
input_block_ids=model_input.input_block_ids, input_block_ids=model_input.input_block_ids,
sampling_params=sampling_params, sampling_params=sampling_params,
**MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, **MultiModalKwargs.as_kwargs(
device=self.device), model_input.multi_modal_kwargs or {},
dtype=self.model_config.dtype,
device=self.device,
),
) )
output = self.model.sample( output = self.model.sample(
......
...@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): ...@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
positions=model_input.input_positions, positions=model_input.input_positions,
input_block_ids=model_input.input_block_ids, input_block_ids=model_input.input_block_ids,
sampling_params=sampling_params, sampling_params=sampling_params,
**MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs **MultiModalKwargs.as_kwargs(
or {}, model_input.multi_modal_kwargs or {},
device=self.device), dtype=self.model_config.dtype,
device=self.device,
),
) )
elif current_platform.use_transformers_neuronx(): elif current_platform.use_transformers_neuronx():
# [TODO] validate on-device sampling # [TODO] validate on-device sampling
...@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): ...@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
input_ids=model_input.input_tokens, input_ids=model_input.input_tokens,
positions=model_input.input_positions, positions=model_input.input_positions,
input_block_ids=model_input.input_block_ids, input_block_ids=model_input.input_block_ids,
**MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs **MultiModalKwargs.as_kwargs(
or {}, model_input.multi_modal_kwargs or {},
device=self.device), dtype=self.model_config.dtype,
device=self.device,
),
) )
# Compute the logits only if the on-device sampling is turned off as # Compute the logits only if the on-device sampling is turned off as
......
...@@ -119,10 +119,14 @@ class PoolingModelRunner( ...@@ -119,10 +119,14 @@ class PoolingModelRunner(
input_ids=model_input.input_tokens, input_ids=model_input.input_tokens,
positions=model_input.input_positions, positions=model_input.input_positions,
intermediate_tensors=intermediate_tensors, intermediate_tensors=intermediate_tensors,
**MultiModalKwargs.as_kwargs(multi_modal_kwargs, **MultiModalKwargs.as_kwargs(
device=self.device), multi_modal_kwargs,
dtype=self.model_config.dtype,
device=self.device,
),
**cross_enc_kwargs, **cross_enc_kwargs,
**seqlen_agnostic_kwargs) **seqlen_agnostic_kwargs,
)
if (self.observability_config is not None if (self.observability_config is not None
and self.observability_config.collect_model_forward_time): and self.observability_config.collect_model_forward_time):
......
...@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): ...@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
) )
ensure_model_parallel_initialized( ensure_model_parallel_initialized(
self.parallel_config.tensor_parallel_size, self.parallel_config.tensor_parallel_size,
self.parallel_config.pipeline_parallel_size, self.parallel_config.pipeline_parallel_size)
self.parallel_config.enable_expert_parallel)
# Device initialization should happen after initializing the distributed # Device initialization should happen after initializing the distributed
# runtime. # runtime.
......
...@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario( ...@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
a supported scenario. a supported scenario.
''' '''
# Reminder: Please update docs/source/features/compatibility_matrix.md # Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if enc_dec_mr.cache_config.enable_prefix_caching: if enc_dec_mr.cache_config.enable_prefix_caching:
......
...@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase): ...@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculate the maximum possible number of GPU and CPU blocks Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory. that can be allocated with the remaining free memory.
:::{tip} Tip:
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
:::
""" """
# Profile the memory usage of the model and get the maximum number of # Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory. # cache blocks that can be allocated with the remaining free memory.
...@@ -530,8 +529,7 @@ def init_worker_distributed_environment( ...@@ -530,8 +529,7 @@ def init_worker_distributed_environment(
init_distributed_environment(parallel_config.world_size, rank, init_distributed_environment(parallel_config.world_size, rank,
distributed_init_method, local_rank) distributed_init_method, local_rank)
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size, parallel_config.pipeline_parallel_size)
parallel_config.enable_expert_parallel)
ensure_kv_transfer_initialized(vllm_config) ensure_kv_transfer_initialized(vllm_config)
......
...@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): ...@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
input_ids=model_input.input_tokens, input_ids=model_input.input_tokens,
positions=model_input.input_positions, positions=model_input.input_positions,
intermediate_tensors=intermediate_tensors, intermediate_tensors=intermediate_tensors,
**MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs **MultiModalKwargs.as_kwargs(
or {}, model_input.multi_modal_kwargs or {},
device=self.device)) dtype=self.model_config.dtype,
device=self.device,
),
)
# Compute the logits in the last pipeline stage. # Compute the logits in the last pipeline stage.
if not get_pp_group().is_last_rank: if not get_pp_group().is_last_rank:
return hidden_or_intermediate_states return hidden_or_intermediate_states
......
...@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): ...@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
Then, it calculate the maximum possible number of GPU and CPU blocks Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory. that can be allocated with the remaining free memory.
:::{tip} Tip:
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
:::
""" """
# Profile the memory usage of the model and get the maximum number of # Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory. # cache blocks that can be allocated with the remaining free memory.
...@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker): ...@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
ensure_model_parallel_initialized( ensure_model_parallel_initialized(
parallel_config.tensor_parallel_size, parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size, parallel_config.pipeline_parallel_size)
parallel_config.enable_expert_parallel)
# global all_reduce needed for overall oneccl warm up # global all_reduce needed for overall oneccl warm up
torch.distributed.all_reduce(torch.zeros(1).xpu()) torch.distributed.all_reduce(torch.zeros(1).xpu())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment