[Core][5/N] Fully working chunked prefill e2e (#3884)

67b4221a · SangBin Cho · GitHub · 63e7176f · 67b4221a · 67b4221a
Unverified Commit 67b4221a authored Apr 11, 2024 by SangBin Cho Committed by GitHub Apr 10, 2024
6 changed files
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -173,10 +173,18 @@ def broadcast_tensor_dict(
        torch.distributed.broadcast_object_list([metadata_list],
                                                src=src,
                                                group=group)
+        async_handles = []
        for key, value in metadata_list:
            if isinstance(value, TensorMetadata):
                tensor = tensor_dict[key]
-                torch.distributed.broadcast(tensor, src=src, group=group)
+                async_handles.append(
+                    torch.distributed.broadcast(tensor,
+                                                src=src,
+                                                group=group,
+                                                async_op=True))
+        for async_handle in async_handles:
+            async_handle.wait()
+
    else:
        recv_metadata_list = [None]
        torch.distributed.broadcast_object_list(recv_metadata_list,

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -386,9 +386,8 @@ class EngineArgs:
            'prompt latency) before scheduling next prompt.')
        parser.add_argument(
            '--enable-chunked-prefill',
-            type=bool,
-            default=False,
-            help='If True, the prefill requests can be chunked based on the '
+            action='store_true',
+            help='If set, the prefill requests can be chunked based on the '
            'max_num_batched_tokens')

        parser.add_argument(

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -633,7 +633,10 @@ class LLMEngine:
            seq_group = scheduled_seq_group.seq_group
            seq_group.update_num_computed_tokens(
                scheduled_seq_group.token_chunk_size)
-            self._process_sequence_group_outputs(seq_group, outputs)
+            # If uncomputed tokens > 0, it means prefill is chunked.
+            # We don't need to process outputs in that case.
+            if seq_group.get_num_uncomputed_tokens() == 0:
+                self._process_sequence_group_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -267,12 +267,13 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        added_tokens_mask = x > self.base_layer.org_vocab_size - 1
-        indices = self.embeddings_indices[1][:self.indices_len[3]].view_as(x)
+        embedding_len = self.indices_len[3]
+        indices = self.embeddings_indices[1][:embedding_len].view_as(x)
        full_lora_a_embeddings = F.embedding(
            x + indices,
            self.lora_a_stacked_2d,
        )
-        indices = self.embeddings_indices[0][:self.indices_len[3]].view_as(x)
+        indices = self.embeddings_indices[0][:embedding_len].view_as(x)
        full_output = self.base_layer.forward(
            x.add_(indices * added_tokens_mask))


--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -500,7 +500,8 @@ class SequenceGroup:
    def get_num_uncomputed_tokens(self) -> int:
        num_uncomputed_tokens = 0
        for seq in self.get_seqs():
-            num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens()
+            if not seq.is_finished():
+                num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens()
        return num_uncomputed_tokens

    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py