[Bugfix] More type hint fixes for py 3.8 (#4039)

5c2e66e4 · Dylan Hawk · GitHub · 546e7211 · 5c2e66e4 · 5c2e66e4
Unverified commit 5c2e66e4, authored Apr 12, 2024 by Dylan Hawk; committed by GitHub on Apr 12, 2024.
4 changed files
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -39,7 +39,7 @@ class ExecutorBase(ABC):
        ExecutorBase may require modification of the result, e.g. to ensure the
        selected cache sizes are compatible with all workers.

-        Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
        are blocks that are "active" on the device and can be appended to.
        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
        appended to.

--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
 """A CPU worker class."""
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 import torch
 import torch.distributed
@@ -157,7 +157,7 @@ class CPUWorker(LoraNotSupportedWorkerBase):
    def load_model(self):
        self.model_runner.load_model()

-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of blocks available for the KV cache.

        This determines how many KV blocks can fit into the configured CPU

--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
 """A Neuron worker class."""
-from typing import List, Optional
+from typing import List, Optional, Tuple

 import torch
 import torch.distributed
@@ -40,7 +40,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase):
    def load_model(self):
        self.model_runner.load_model()

-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks.

        Swapping is not yet supported, so always return num_cpu_blocks=0.

--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
 from abc import ABC, abstractmethod
-from typing import Dict, List
+from typing import Dict, List, Tuple

 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -18,14 +18,14 @@ class WorkerBase(ABC):
        raise NotImplementedError

    @abstractmethod
-    def determine_num_available_blocks(self) -> tuple[int, int]:
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available blocks for the GPU KV cache and
        swappable CPU KV cache.

        The implementation may run profiling or other heuristics to determine
        the size of caches.

-        Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
        are blocks that are "active" on the device and can be appended to.
        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
        appended to.