[mypy] Enable type checking for test directory (#5017)

0e9164b4 · Cyrus Leung · GitHub · 1b8a0d71 · 0e9164b4 · 0e9164b4
Unverified Commit 0e9164b4 authored Jun 15, 2024 by Cyrus Leung Committed by GitHub Jun 15, 2024
20 changed files
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -271,7 +271,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
        """
        source_blocks = get_all_blocks_recursively(last_block)

-        forked_blocks = []
+        forked_blocks: List[Block] = []
        prev_block = None
        for block in source_blocks:
            refcount = self._refcounter.incr(block.block_id)

--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -260,7 +260,7 @@ class BlockSpaceManagerV2(BlockSpaceManager):
        # at max extend.
        if self.enable_caching:
            block_table = self.block_tables[seq.seq_id]
-            block_ids = []
+            block_ids: List[Optional[int]] = []
            for block_id in block_table.physical_block_ids:
                block_ids.append(block_id)
            self.block_allocator.mark_blocks_as_accessed(

--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -2,7 +2,7 @@ import ctypes
 import json
 import os
 from itertools import product
-from typing import Dict, Optional, Sequence
+from typing import Dict, List, Optional, Sequence

 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int],
 def can_actually_p2p(
    batch_src: Sequence[int],
    batch_tgt: Sequence[int],
-):
+) -> Sequence[bool]:
    """
    Usually, checking if P2P access is enabled can be done by
    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
@@ -138,7 +138,7 @@ def can_actually_p2p(
    p_tgt.start()
    p_src.join()
    p_tgt.join()
-    result = []
+    result: List[bool] = []
    for src, tgt in zip(batch_src, batch_tgt):
        a = result_queue.get()
        b = result_queue.get()
@@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
        # only the local master process (with local_rank == 0) can
        #  enter this block to calculate the cache
        logger.info("generating GPU P2P access cache in %s", path)
-        cache = {}
+        cache: Dict[str, bool] = {}
        ids = list(range(num_dev))
        # batch of all pairs of GPUs
        batch_src, batch_tgt = zip(*list(product(ids, ids)))

--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -205,7 +205,7 @@ class NCCLLibrary:
            raise e

        if so_file not in NCCLLibrary.path_to_dict_mapping:
-            _funcs = {}
+            _funcs: Dict[str, Any] = {}
            for func in NCCLLibrary.exported_functions:
                f = getattr(self.lib, func.name)
                f.restype = func.restype

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2,7 +2,7 @@ import time
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional
 from typing import Sequence as GenericSequence
-from typing import Type, TypeVar, Union
+from typing import Set, Type, TypeVar, Union

 from transformers import GenerationConfig, PreTrainedTokenizer

@@ -973,7 +973,7 @@ class LLMEngine:
    def remove_lora(self, lora_id: int) -> bool:
        return self.model_executor.remove_lora(lora_id)

-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
        return self.model_executor.list_loras()

    def check_health(self) -> None:

--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -144,7 +144,7 @@ class Metrics:
 # end-metrics-definitions


-def build_1_2_5_buckets(max_value: int):
+def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Builds a list of buckets with increasing powers of 10 multiplied by 
    mantissa values (1, 2, 5) until the value exceeds the specified maximum.
@@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int):
    """
    mantissa_lst = [1, 2, 5]
    exponent = 0
-    buckets = []
+    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent

--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

 from vllm.config import SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -146,8 +146,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):

        # Beam search case
        # Select the child sequences to keep in the sequence group.
-        selected_child_seqs = []
-        unselected_child_seqs = []
+        selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
+        unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = []
        beam_width = seq_group.sampling_params.best_of
        length_penalty = seq_group.sampling_params.length_penalty


--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -2,6 +2,7 @@ import argparse
 import asyncio
 import sys
 from io import StringIO
+from typing import Awaitable, List

 import aiohttp

@@ -114,7 +115,7 @@ async def main(args):
    )

    # Submit all requests in the file to the engine "concurrently".
-    response_futures = []
+    response_futures: List[Awaitable[BatchRequestOutput]] = []
    for request_json in (await read_file(args.input_file)).strip().split("\n"):
        request = BatchRequestInput.model_validate_json(request_json)
        response_futures.append(run_request(openai_serving_chat, request))

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -487,7 +487,7 @@ class OpenAIServingChat(OpenAIServing):
            final_res = res
        assert final_res is not None

-        choices = []
+        choices: List[ChatCompletionResponseChoice] = []

        role = self.get_chat_request_role(request)
        for output in final_res.outputs:

--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -25,7 +25,7 @@ def request_output_to_embedding_response(
    created_time: int,
    model_name: str,
 ) -> EmbeddingResponse:
-    data = []
+    data: List[EmbeddingResponseData] = []
    num_prompt_tokens = 0
    for idx, final_res in enumerate(final_res_batch):
        assert final_res is not None

--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
 from typing import List, Optional
+from typing import Sequence as GenericSequence

 import torch

@@ -120,7 +121,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):

    @classmethod
    def pack(
-            cls, loras: List[Optional["LoRALayerWeights"]]
+        cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
    ) -> "PackedLoRALayerWeights":
        """Pack a list of LoRAs into a single LoRA.


--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -165,7 +165,7 @@ class WorkerLoRAManager(AbstractWorkerLoRAManager):
            model = self._lora_manager.model
            supported_lora_modules = model.supported_lora_modules
            packed_modules_mapping = model.packed_modules_mapping
-            expected_lora_modules = []
+            expected_lora_modules: List[str] = []
            for module in supported_lora_modules:
                if module in packed_modules_mapping:
                    expected_lora_modules.extend(

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -393,7 +393,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                param_data.copy_(loaded_weight)
                return
            current_shard_offset = 0
-            shard_offsets = []
+            shard_offsets: List[Tuple[int, int, int]] = []
            for i, output_size in enumerate(self.output_sizes):
                shard_offsets.append((i, current_shard_offset, output_size))
                current_shard_offset += output_size

--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -25,24 +25,25 @@ GPTQ_MARLIN_SUPPORTED_SYM = [True]


 # Permutations for Marlin scale shuffling
-def get_scale_perms(num_bits):
-    scale_perm = []
+def get_scale_perms(num_bits: int):
+    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single


-def get_pack_factor(num_bits):
+def get_pack_factor(num_bits: int):
    assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
            ), f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits


-def marlin_permute_scales(s, size_k, size_n, group_size, num_bits):
+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+                          group_size: int, num_bits: int):
    scale_perm, scale_perm_single = get_scale_perms(num_bits)
    if group_size < size_k and group_size != -1:
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]

--- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
 """This file is used for /tests and /benchmarks"""
+from typing import Dict, List
+
 import numpy
 import torch

@@ -11,10 +13,10 @@ import torch
 #
 # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
 # (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms_24(num_bits):
-    perm_list = []
+def get_perms_24(num_bits: int):
+    perm_list: List[int] = []
    for i in range(32):
-        perm1 = []
+        perm1: List[int] = []
        col = i // 4
        col_o = col // 2
        for block in [0, 1]:
@@ -39,18 +41,18 @@ def get_perms_24(num_bits):

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
-    scale_perm = []
+    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
    for i in range(8):
        scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
    return perm, scale_perm, scale_perm_single


-marlin_24_perm = {}
-marlin_24_scale_perm = {}
-marlin_24_scale_perm_single = {}
+marlin_24_perm: Dict[int, torch.Tensor] = {}
+marlin_24_scale_perm: Dict[int, List[int]] = {}
+marlin_24_scale_perm_single: Dict[int, List[int]] = {}
 for num_bits in [4, 8]:
    perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
    marlin_24_perm[num_bits] = perm_24

--- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py
 """This file is used for /tests and /benchmarks"""
+from typing import Dict, List
+
 import numpy
 import torch

@@ -11,10 +13,10 @@ import torch
 #
 # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
 # (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms(num_bits):
-    perm_list = []
+def get_perms(num_bits: int):
+    perm_list: List[int] = []
    for i in range(32):
-        perm1 = []
+        perm1: List[int] = []
        col = i // 4
        for block in [0, 1]:
            for row in [
@@ -38,19 +40,19 @@ def get_perms(num_bits):

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    perm = torch.from_numpy(perm)
-    scale_perm = []
+    scale_perm: List[int] = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
-    scale_perm_single = []
+    scale_perm_single: List[int] = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return perm, scale_perm, scale_perm_single


-marlin_perm = {}
-marlin_scale_perm = {}
-marlin_scale_perm_single = {}
+marlin_perm: Dict[int, torch.Tensor] = {}
+marlin_scale_perm: Dict[int, List[int]] = {}
+marlin_scale_perm_single: Dict[int, List[int]] = {}
 for num_bits in [4, 8]:
    perm, scale_perm, scale_perm_single = get_perms(num_bits)
    marlin_perm[num_bits] = perm

--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -174,7 +174,7 @@ def _apply_min_tokens_penalty(
        min_tokens = sampling_params.min_tokens
        token_ids_to_penalize = sampling_params.all_stop_token_ids
        if min_tokens > 0 and token_ids_to_penalize:
-            seqs_to_penalize = []
+            seqs_to_penalize: List[int] = []
            for j, seq_id in enumerate(seq_ids):
                seq_data = seq_group.seq_data[seq_id]
                if len(seq_data.output_token_ids) < min_tokens:
@@ -285,7 +285,7 @@ def _greedy_sample(
        same as the length of selected_seq_groups. If the corresponding
        seq_group has do_sample=False, tuple contains ([], [])
    """
-    samples = samples.tolist()
+    samples_lst = samples.tolist()
    sample_idx = 0
    results: SampleResultType = []
    for seq_group in selected_seq_groups:
@@ -298,7 +298,7 @@ def _greedy_sample(
        assert num_parent_seqs == 1, (
            "Greedy sampling should have only one seq.")
        parent_ids = list(range(num_parent_seqs))
-        next_token_ids = [samples[sample_idx]]
+        next_token_ids = [samples_lst[sample_idx]]
        results.append((next_token_ids, parent_ids))
        sample_idx += num_parent_seqs
    return results
@@ -394,7 +394,7 @@ def _beam_search_sample(
            next_token_ids = next_token_ids.tolist()
        else:
            # Generation phase.
-            cumulative_logprobs: List[int] = [
+            cumulative_logprobs: List[float] = [
                seq_group.seq_data[seq_id].cumulative_logprob
                for seq_id in seq_ids
            ]
@@ -466,8 +466,9 @@ def _sample_with_torch(
        categorized_seq_group_ids[sampling_type].append(i)

    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
-    sample_metadata = {}
-    multinomial_samples = {}
+    sample_metadata: Dict[SamplingType,
+                          Tuple[List[int], List[SequenceGroupToSample]]] = {}
+    multinomial_samples: Dict[SamplingType, torch.Tensor] = {}

    # Create output tensor for sampled token ids.
    if include_gpu_probs_tensor:
@@ -494,7 +495,7 @@ def _sample_with_torch(
            greedy_samples = torch.argmax(logprobs[long_sample_indices],
                                          dim=-1)

-            if include_gpu_probs_tensor:
+            if sampled_token_ids_tensor is not None:
                # Store sampled tokens in output tensor.
                sampled_token_ids_tensor[
                    long_sample_indices] = greedy_samples.unsqueeze(-1)
@@ -522,7 +523,7 @@ def _sample_with_torch(
                probs[long_sample_indices], max_best_of_in_batch,
                **seeded_args)

-            if include_gpu_probs_tensor:
+            if sampled_token_ids_tensor is not None:
                # Store sampled tokens in output tensor.
                sampled_token_ids_tensor[
                    long_sample_indices] = multinomial_samples[sampling_type]
@@ -571,7 +572,9 @@ def _sample_with_triton_kernel(
        categorized_seq_group_ids[sampling_type].append(i)

    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
-    sample_metadata = {}
+    sample_metadata: Dict[SamplingType,
+                          Tuple[List[int], List[SequenceGroupToSample],
+                                torch.Tensor, torch.Tensor]] = {}
    max_best_of_in_batch = 1

    # Counterintuitively, having two loops here is actually faster.
@@ -1008,14 +1011,14 @@ def _build_sampler_output(
            speculative decoding rejection sampling.
    """

-    sampler_output = []
+    sampler_output: List[CompletionSequenceGroupOutput] = []
    for (seq_group, sample_result, group_prompt_logprobs,
         group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                       sample_results, prompt_logprobs,
                                       sample_logprobs):
        seq_ids = seq_group.seq_ids
        next_token_ids, parent_ids = sample_result
-        seq_outputs = []
+        seq_outputs: List[SequenceOutput] = []
        for parent_id, next_token_id, logprobs in zip(parent_ids,
                                                      next_token_ids,
                                                      group_sample_logprobs):

--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -68,7 +68,7 @@ def _get_model_initialization_kwargs(
        vision_language_config: Optional[VisionLanguageConfig]
 ) -> Dict[str, Any]:
    """Get extra kwargs for model initialization."""
-    extra_kwargs = {}
+    extra_kwargs: Dict[str, Any] = {}
    if hasattr(model_class, "supported_lora_modules"):
        extra_kwargs["lora_config"] = lora_config
    elif lora_config:
@@ -446,7 +446,8 @@ class ShardedStateLoader(BaseModelLoader):
        Filter out all tensors that share the same memory or a subset of the
        memory of another tensor.
        """
-        same_storage_groups = collections.defaultdict(list)
+        same_storage_groups: Dict[Any, List[Tuple[
+            str, torch.Tensor]]] = collections.defaultdict(list)
        for key, tensor in tensors.items():
            if tensor.numel():
                ptr = tensor.untyped_storage().data_ptr()
@@ -455,7 +456,7 @@ class ShardedStateLoader(BaseModelLoader):
        def get_end_ptr(tensor: torch.Tensor) -> int:
            return tensor.view(-1)[-1].data_ptr() + tensor.element_size()

-        result = {}
+        result: Dict[str, torch.Tensor] = {}
        for group in same_storage_groups.values():
            for k, t in group:
                a, b = t.data_ptr(), get_end_ptr(t)

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -329,7 +329,7 @@ def np_cache_weights_iterator(
    # dumping the same model weights to numpy at the same time.
    with get_lock(model_name_or_path, cache_dir):
        if not os.path.exists(weight_names_file):
-            weight_names = []
+            weight_names: List[str] = []
            for bin_file in hf_weights_files:
                state = torch.load(bin_file, map_location="cpu")
                for name, param in state.items():

--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -72,11 +72,11 @@ _MODELS = {**_GENERATION_MODELS, **_EMBEDDING_MODELS}
 _OOT_MODELS: Dict[str, Type[nn.Module]] = {}

 # Models not supported by ROCm.
-_ROCM_UNSUPPORTED_MODELS = []
+_ROCM_UNSUPPORTED_MODELS: List[str] = []

 # Models partially supported by ROCm.
 # Architecture -> Reason.
-_ROCM_PARTIALLY_SUPPORTED_MODELS = {
+_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
    "Qwen2ForCausalLM":
    "Sliding window attention is not yet supported in ROCm's flash attention",
    "MistralForCausalLM":