Update deprecated type hinting in `vllm/device_allocator` and `vllm/distributed` (#18126)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Update deprecated type hinting in `vllm/device_allocator` and `vllm/distributed` (#18126)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
dc372b9c · Harry Mellor · GitHub · 9b5b39b6 · dc372b9c · dc372b9c
Unverified commit dc372b9c, authored May 14, 2025 by Harry Mellor; committed by GitHub on May 14, 2025
20 changed files
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,8 +74,6 @@ exclude = [
 # Python 3.8 typing. TODO: Remove these excludes after v1.0.0
 "vllm/attention/**/*.py" = ["UP006", "UP035"]
 "vllm/core/**/*.py" = ["UP006", "UP035"]
-"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
-"vllm/distributed/**/*.py" = ["UP006", "UP035"]
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
 "vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"]

--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -11,7 +11,7 @@ import dataclasses
 import gc
 import os
 from contextlib import contextmanager
-from typing import Any, Callable, Dict, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union

 import torch

@@ -63,7 +63,7 @@ except ModuleNotFoundError:
    libcudart = None

 # py_device, py_alignedSize, py_d_mem, py_p_memHandle
-HandleType = Tuple[int, int, int, int]
+HandleType = tuple[int, int, int, int]


 @dataclasses.dataclass
@@ -148,9 +148,9 @@ class CuMemAllocator:
            "Please track https://github.com/pytorch/pytorch/issues/147851 "
            "for the latest updates.")

-        self.pointer_to_data: Dict[int, AllocationData] = {}
+        self.pointer_to_data: dict[int, AllocationData] = {}
        self.current_tag: str = CuMemAllocator.default_tag
-        self.allocator_and_pools: Dict[str, Any] = {}
+        self.allocator_and_pools: dict[str, Any] = {}

    def python_malloc_callback(self, allocation_handle: HandleType) -> None:
        """
@@ -172,7 +172,7 @@ class CuMemAllocator:

    def sleep(
            self,
-            offload_tags: Optional[Union[Tuple[str, ...],
+            offload_tags: Optional[Union[tuple[str, ...],
                                         str]] = None) -> None:
        """
        Put the allocator in sleep mode.

--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Any, Dict, Optional, Union
+from typing import Any, Optional, Union

 import torch
 import torch.distributed
@@ -32,7 +32,7 @@ def tensor_model_parallel_gather(input_: torch.Tensor,
    return get_tp_group().gather(input_, dst, dim)


-def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor,
+def broadcast_tensor_dict(tensor_dict: Optional[dict[Any, Union[torch.Tensor,
                                                                Any]]] = None,
                          src: int = 0):
    if not torch.distributed.is_initialized():

--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional, Tuple
+from typing import Optional

 import torch
 import torch.distributed as dist
@@ -160,7 +160,7 @@ class DeviceCommunicatorBase:

    def dispatch(
            self, hidden_states: torch.Tensor,
-            router_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Dispatch the hidden states and router logits to the appropriate device.
        This is a no-op in the base class.

--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
 # SPDX-License-Identifier: Apache-2.0

 import os
-from typing import List, Optional
+from typing import Optional

 import torch
 from torch.distributed import ProcessGroup
@@ -126,7 +126,7 @@ class _CPUSHMDistributed:

    def gather(self,
               input: torch.Tensor,
-               gather_list: Optional[List[torch.Tensor]],
+               gather_list: Optional[list[torch.Tensor]],
               dst: int = -1,
               group: Optional[ProcessGroup] = None) -> None:
        # Note: different from the torch gather, here we use local dst rank.

--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch
 from torch.distributed import ProcessGroup
@@ -154,7 +154,7 @@ class CudaCommunicator(DeviceCommunicatorBase):

    def dispatch(
            self, hidden_states: torch.Tensor,
-            router_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        assert self.all2all_impl is not None
        hidden_states, router_logits = self.all2all_impl.dispatch(
            hidden_states, router_logits)

--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -6,7 +6,7 @@ convenient for use when we just need to call a few functions.

 import ctypes
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 # this line makes it possible to directly load `libcudart.so` using `ctypes`
 import torch  # noqa
@@ -32,7 +32,7 @@ class cudaIpcMemHandle_t(ctypes.Structure):
 class Function:
    name: str
    restype: Any
-    argtypes: List[Any]
+    argtypes: list[Any]


 def find_loaded_library(lib_name) -> Optional[str]:
@@ -97,11 +97,11 @@ class CudaRTLibrary:

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
-    path_to_library_cache: Dict[str, Any] = {}
+    path_to_library_cache: dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
-    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+    path_to_dict_mapping: dict[str, dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):
        if so_file is None:

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
 # SPDX-License-Identifier: Apache-2.0

 from contextlib import contextmanager
-from typing import List, Optional, Union
+from typing import Optional, Union

 import torch
 import torch.distributed as dist
@@ -276,7 +276,7 @@ class CustomAllreduce:
    @staticmethod
    def create_shared_buffer(size_in_bytes: int,
                             group: Optional[ProcessGroup] = None,
-                             uncached: Optional[bool] = False) -> List[int]:
+                             uncached: Optional[bool] = False) -> list[int]:
        pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes)

        world_size = dist.get_world_size(group=group)
@@ -284,7 +284,7 @@ class CustomAllreduce:
        handles = [None] * world_size
        dist.all_gather_object(handles, handle, group=group)

-        pointers: List[int] = []
+        pointers: list[int] = []
        for i, h in enumerate(handles):
            if i == rank:
                pointers.append(pointer)  # type: ignore
@@ -293,7 +293,7 @@ class CustomAllreduce:
        return pointers

    @staticmethod
-    def free_shared_buffer(pointers: List[int],
+    def free_shared_buffer(pointers: list[int],
                           group: Optional[ProcessGroup] = None,
                           rank: Optional[int] = 0) -> None:
        if rank is None:

--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -7,8 +7,9 @@ import pickle
 import subprocess
 import sys
 import tempfile
+from collections.abc import Sequence
 from itertools import product
-from typing import Dict, List, Optional, Sequence
+from typing import Optional

 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -149,7 +150,7 @@ def can_actually_p2p(
    p_src.join()
    p_tgt.join()
    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
-    result: List[bool] = []
+    result: list[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        a = result_queue.get()
        b = result_queue.get()
@@ -175,7 +176,7 @@ def can_actually_p2p(
 #  e.g. used by different vllm engines. The device id in the cache file is a
 #  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
 #  of visible devices in the vllm engine.
-_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+_gpu_p2p_access_cache: Optional[dict[str, bool]] = None


 def gpu_p2p_access_check(src: int, tgt: int) -> bool:
@@ -204,7 +205,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
        # only the local master process (with local_rank == 0) can
        #  enter this block to calculate the cache
        logger.info("generating GPU P2P access cache in %s", path)
-        cache: Dict[str, bool] = {}
+        cache: dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*list(product(ids, ids)))

--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -24,7 +24,7 @@
 import ctypes
 import platform
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional

 import torch
 from torch.distributed import ReduceOp
@@ -121,7 +121,7 @@ class ncclRedOpTypeEnum:
 class Function:
    name: str
    restype: Any
-    argtypes: List[Any]
+    argtypes: list[Any]


 class NCCLLibrary:
@@ -210,11 +210,11 @@ class NCCLLibrary:

    # class attribute to store the mapping from the path to the library
    # to avoid loading the same library multiple times
-    path_to_library_cache: Dict[str, Any] = {}
+    path_to_library_cache: dict[str, Any] = {}

    # class attribute to store the mapping from library path
    #  to the corresponding dictionary
-    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+    path_to_dict_mapping: dict[str, dict[str, Any]] = {}

    def __init__(self, so_file: Optional[str] = None):

@@ -238,7 +238,7 @@ class NCCLLibrary:
            raise e

        if so_file not in NCCLLibrary.path_to_dict_mapping:
-            _funcs: Dict[str, Any] = {}
+            _funcs: dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype

--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from multiprocessing import shared_memory
 from threading import Event
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Optional, Union
 from unittest.mock import patch

 import torch
@@ -173,9 +173,9 @@ class ShmRingBuffer:

 @dataclass
 class Handle:
-    local_reader_ranks: List[int] = field(default_factory=list)
+    local_reader_ranks: list[int] = field(default_factory=list)

-    buffer_handle: Optional[Tuple[int, int, int, str]] = None
+    buffer_handle: Optional[tuple[int, int, int, str]] = None
    local_subscribe_addr: Optional[str] = None
    remote_subscribe_addr: Optional[str] = None
    remote_addr_ipv6: bool = False
@@ -187,7 +187,7 @@ class MessageQueue:
        self,
        n_reader,  # number of all readers
        n_local_reader,  # number of local readers through shared memory
-        local_reader_ranks: Optional[List[int]] = None,
+        local_reader_ranks: Optional[list[int]] = None,
        max_chunk_bytes: int = 1024 * 1024 * 10,
        max_chunks: int = 10,
        connect_ip: Optional[str] = None,

--- a/vllm/distributed/kv_transfer/kv_connector/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/base.py
@@ -8,7 +8,7 @@ The class provides two primary abstract methods:
 """

 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union

 import torch

@@ -55,7 +55,7 @@ class KVConnectorBase(ABC):
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:
@@ -71,7 +71,7 @@ class KVConnectorBase(ABC):
                start and end layer information.
            model_input (ModelInputForGPUWithSamplingMetadata): The input
                metadata from vLLM.
-            kv_caches (List[torch.Tensor]): List of KV caches (keys and values) 
+            kv_caches (list[torch.Tensor]): List of KV caches (keys and values) 
                for each layer.
            hidden_or_intermediate_states (Union[torch.Tensor, 
            IntermediateTensors]): 
@@ -88,8 +88,8 @@ class KVConnectorBase(ABC):
    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor]
-    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:
        """
        Receive KV caches and hidden states from the connector.
@@ -104,7 +104,7 @@ class KVConnectorBase(ABC):
                The model executable from vLLM modelrunner.
            model_input (ModelInputForGPUWithSamplingMetadata): 
                The model input from vLLM modelrunner.
-            kv_caches (List[torch.Tensor]): 
+            kv_caches (list[torch.Tensor]): 
                List of KV caches for each layer.

        Returns:

--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
 # SPDX-License-Identifier: Apache-2.0

 import importlib
-from typing import TYPE_CHECKING, Callable, Dict, Type
+from typing import TYPE_CHECKING, Callable

 import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
@@ -18,7 +18,7 @@ logger = init_logger(__name__)


 class KVConnectorFactory:
-    _registry: Dict[str, Callable[[], Type[KVConnectorBaseType]]] = {}
+    _registry: dict[str, Callable[[], type[KVConnectorBaseType]]] = {}

    @classmethod
    def register_connector(cls, name: str, module_path: str,
@@ -27,7 +27,7 @@ class KVConnectorFactory:
        if name in cls._registry:
            raise ValueError(f"Connector '{name}' is already registered.")

-        def loader() -> Type[KVConnectorBaseType]:
+        def loader() -> type[KVConnectorBaseType]:
            module = importlib.import_module(module_path)
            return getattr(module, class_name)


--- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py
@@ -7,7 +7,7 @@ The LMCacheConnector can (1) transfer KV caches between prefill vLLM worker
 (2) offload and share KV caches.
 """

-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union

 import torch

@@ -63,8 +63,8 @@ class LMCacheConnector(KVConnectorBase):
    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor]
-    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:

        retrieve_status = self.lmcache_should_retrieve(model_input)
@@ -78,7 +78,7 @@ class LMCacheConnector(KVConnectorBase):
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:

--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
@@ -6,7 +6,7 @@ The MooncakeStoreConnector transfers KV caches between prefill vLLM workers
 database-style KVStore.
 """
 import hashlib
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union

 import torch

@@ -70,7 +70,7 @@ class MooncakeStoreConnector(KVConnectorBase):
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:
@@ -113,8 +113,8 @@ class MooncakeStoreConnector(KVConnectorBase):
    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor]
-    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:
        bypass_model_exec = True
        input_tokens_tensor = model_input.input_tokens

--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -8,7 +8,7 @@ MooncakePipe.

 But the logic can be extended to support other pipe and lookup buffer.
 """
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Optional, Union

 import torch

@@ -133,7 +133,7 @@ class SimpleConnector(KVConnectorBase):
            )

    def select(self, input_tokens: Optional[torch.Tensor],
-               roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]:
+               roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:

        assert self.consumer_buffer is not None, "Please initialize the "\
            "consumer buffer before calling select."
@@ -152,7 +152,7 @@ class SimpleConnector(KVConnectorBase):
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:
@@ -207,8 +207,8 @@ class SimpleConnector(KVConnectorBase):
    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor]
-    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:

        # When bypass_model_exec is set to False, it means that at least for one

--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -5,13 +5,13 @@ import threading
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Iterator
+from typing import TYPE_CHECKING, Any, Optional

 import msgspec
 import torch
 import zmq
-from typing_extensions import Optional

 from vllm import envs
 from vllm.config import VllmConfig

--- a/vllm/distributed/kv_transfer/kv_connector_agent.py
+++ b/vllm/distributed/kv_transfer/kv_connector_agent.py
@@ -5,7 +5,7 @@ This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
 1. `send_kv_caches_and_hidden_states`
 2. `recv_kv_caches_and_hidden_states
 """
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Union

 if TYPE_CHECKING:
    from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
@@ -53,7 +53,7 @@ class KVTransferAgent:
        self,
        model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor],
+        kv_caches: list[torch.Tensor],
        hidden_or_intermediate_states: Union[torch.Tensor,
                                             IntermediateTensors],
    ) -> None:
@@ -68,8 +68,8 @@ class KVTransferAgent:
    def recv_kv_caches_and_hidden_states(
        self, model_executable: torch.nn.Module,
        model_input: "ModelInputForGPUWithSamplingMetadata",
-        kv_caches: List[torch.Tensor]
-    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+        kv_caches: list[torch.Tensor]
+    ) -> tuple[Union[torch.Tensor, IntermediateTensors], bool,
               "ModelInputForGPUWithSamplingMetadata"]:

        return self.connector.recv_kv_caches_and_hidden_states(

--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -13,7 +13,7 @@ These classes above are abstracted behind class `KVCacheBufferBase`.
 """

 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import Optional

 import torch

@@ -93,7 +93,7 @@ class KVLookupBufferBase(KVCacheBufferBase):
    @abstractmethod
    def drop_select(
            self, input_tokens: Optional[torch.Tensor],
-            roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]:
+            roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:
        """Select and *drop* KV cache entries from the lookup buffer.
        
        The functionality is similar to the following python statements
@@ -111,7 +111,7 @@ class KVLookupBufferBase(KVCacheBufferBase):
            roi (torch.Tensor): A binary mask on top of the input tokens

        Returns:
-            List[Optional[torch.Tensor]]: A list of tensors. Can be None.
+            list[Optional[torch.Tensor]]: A list of tensors. Can be None.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.

--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -11,7 +11,7 @@
 """
 import threading
 from collections import deque
-from typing import Deque, List, Optional, Union
+from typing import Optional, Union

 import torch

@@ -38,7 +38,7 @@ class SimpleBuffer(KVLookupBufferBase):
        data_pipe: on device (e.g. GPU)
        """

-        self.buffer: Deque[List[torch.Tensor]] = deque()
+        self.buffer: deque[list[torch.Tensor]] = deque()

        self.buffer_size = 0
        self.buffer_size_threshold = buffer_size_thresh
@@ -50,8 +50,8 @@ class SimpleBuffer(KVLookupBufferBase):
        self.normal_signal = torch.tensor([0], device="cpu")
        self.end_signal = None

-    def _matches(self, tokens_roi_sender: List[torch.Tensor],
-                 tokens_roi_recver: List[torch.Tensor]):
+    def _matches(self, tokens_roi_sender: list[torch.Tensor],
+                 tokens_roi_recver: list[torch.Tensor]):

        # tokens_roi_sender: tokens and roi of the producer (in the buffer)
        # tokens_roi_recver: tokens and roi of the consumer (query)
@@ -88,7 +88,7 @@ class SimpleBuffer(KVLookupBufferBase):
            tensor = tensor.float()
        self.data_pipe.send_tensor(tensor)

-    def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]):
+    def _get_element_size(self, data: Optional[Union[list, torch.Tensor]]):

        if isinstance(data, torch.Tensor):
            return data.element_size() * data.numel()
@@ -151,7 +151,7 @@ class SimpleBuffer(KVLookupBufferBase):
                tokens_roi_recver = [input_tokens, roi]

                def is_buffer_available(
-                    tokens_roi_recver: List[torch.Tensor], ) -> bool:
+                    tokens_roi_recver: list[torch.Tensor], ) -> bool:
                    # perform input tokens and roi matching
                    # FIXME: this matching is O(n), ideally it should be O(1)
                    # but this buffer size won't (and shouldn't) be too large so
@@ -184,7 +184,7 @@ class SimpleBuffer(KVLookupBufferBase):

    def drop_select(
            self, input_tokens: Optional[torch.Tensor],
-            roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]:
+            roi: Optional[torch.Tensor]) -> list[Optional[torch.Tensor]]:

        assert self.request_handling_thread is None, \
            "drop_select should be called by the KV cache consumer "\