[Core][Optimization] change python dict to pytorch tensor for blocks to swap (#4659)

20cfcdec · youkaichao · GitHub · ad932a22 · 20cfcdec
Unverified Commit 20cfcdec authored May 08, 2024 by youkaichao Committed by GitHub May 08, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 7 deletions

vllm/worker/worker.py vllm/worker/worker.py +20 -7

No files found.
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -195,15 +195,14 @@ class Worker(WorkerBase):

    def cache_swap(
        self,
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
+        blocks_to_swap_in: torch.Tensor,
+        blocks_to_swap_out: torch.Tensor,
        blocks_to_copy: torch.Tensor,
    ) -> None:
        # Issue cache operations.
-        # TODO(woosuk): Profile swapping overhead and optimize if needed.
-        if blocks_to_swap_in:
+        if blocks_to_swap_in.numel() > 0:
            self.cache_engine.swap_in(blocks_to_swap_in)
-        if blocks_to_swap_out:
+        if blocks_to_swap_out.numel() > 0:
            self.cache_engine.swap_out(blocks_to_swap_out)
        if blocks_to_copy.numel() > 0:
            self.cache_engine.copy(blocks_to_copy)
@@ -219,12 +218,26 @@ class Worker(WorkerBase):
        else:
            seq_group_metadata_list = execute_model_req.seq_group_metadata_list

+        blocks_to_swap_in: torch.Tensor
+        blocks_to_swap_out: torch.Tensor
+        blocks_to_copy: torch.Tensor
        if self.is_driver_worker:
            assert seq_group_metadata_list is not None
            assert execute_model_req is not None
            num_seq_groups = len(seq_group_metadata_list)
-            blocks_to_swap_in = execute_model_req.blocks_to_swap_in
-            blocks_to_swap_out = execute_model_req.blocks_to_swap_out
+            # `blocks_to_swap_in` and `blocks_to_swap_out` are CPU tensors.
+            # They contain the parameters used to launch cudaMemcpyAsync.
+            blocks_to_swap_in = torch.tensor(
+                execute_model_req.blocks_to_swap_in,
+                device="cpu",
+                dtype=torch.int64).view(-1, 2)
+            blocks_to_swap_out = torch.tensor(
+                execute_model_req.blocks_to_swap_out,
+                device="cpu",
+                dtype=torch.int64).view(-1, 2)
+            # `blocks_to_copy` is a GPU tensor. The src and tgt of
+            # blocks to copy are on the same device, and `blocks_to_copy`
+            # can be used directly within CUDA kernels.
            blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
                                          device=self.device,
                                          dtype=torch.int64).view(-1, 2)