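Remove TBO tuning envs (VLLM_TBO_REQ_DELAY_MS, VLLM_TBO_DECODE_BS, VLLM_TBO_MIN_TOKENS) and VLLM_ZERO_OVERHEAD; VLLM_ENABLE_TBO is kept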

5db8533c · zhuwenwen · 573531eb · 5db8533c · 5db8533c · 5db8533c
Commit 5db8533c authored Oct 20, 2025 by zhuwenwen
Showing 3 changed files with 47 additions and 87 deletions

vllm/envs.py vllm/envs.py +0 -20

vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/common.py +47 -57

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +0 -10

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -222,10 +222,6 @@ if TYPE_CHECKING:
    VLLM_HAS_CONTEXT_DEFAULT: bool = False
    VLLM_USE_NN: bool = False
    VLLM_ENABLE_TBO: bool = False
-    VLLM_TBO_REQ_DELAY_MS: int = 0
-    VLLM_TBO_DECODE_BS: int = 0
-    VLLM_TBO_MIN_TOKENS: int = 200
-    VLLM_ZERO_OVERHEAD: bool = False
    VLLM_ENABLE_MOE_FUSED_GATE: bool = False
    VLLM_USE_FLASH_ATTN_PA: bool = False
    VLLM_USE_APEX_RN: bool = False
@@ -1583,22 +1579,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Enable two batch overlap.
    "VLLM_ENABLE_TBO":
    lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),
-    
-    # set delay on server when only one requet, the purpose is to merge a larger batch.
-    "VLLM_TBO_REQ_DELAY_MS":
-    lambda: int(os.getenv("VLLM_TBO_REQ_DELAY_MS", "0")),
-
-    # set the minimum batch size to enable TBO in decode, if < 2 , disable TBO in decode.
-    "VLLM_TBO_DECODE_BS":
-    lambda: int(os.getenv("VLLM_TBO_DECODE_BS", "0")),
-    
-    # set the minimum tokens size for each mini-batch to enable TBO on v1, default is 200.
-    "VLLM_TBO_MIN_TOKENS":
-    lambda: int(os.getenv("VLLM_TBO_MIN_TOKENS", "200")),
-
-    # Enable zero overhead scheduler.
-    "VLLM_ZERO_OVERHEAD":
-    lambda: bool(int(os.getenv("VLLM_ZERO_OVERHEAD", "0"))),

    # If set, vLLM will enable the moe_fused_gate kernel.
    "VLLM_ENABLE_MOE_FUSED_GATE":

--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -876,63 +876,53 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
        # TODO @ wangming
        decode_metadata = None
        # if num_decodes > 0:
-            # if self.use_spec_decode and not common_attn_metadata.spec_layer_decoding:
-            #     query_lens = self.num_scheduled_tokens_np[:num_decodes]
-            #     cu_num_blocks = np.cumsum(query_lens)
-            #     virtual_batches = cu_num_blocks[-1]
-            #     block_offsets = np.repeat(cu_num_blocks - query_lens, query_lens)
-            #     arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
-            #     rarange = np.repeat(query_lens, query_lens) - arange - 1
-
-            #     repeats = torch.from_numpy(query_lens).pin_memory().to(
-            #         block_table_tensor.device, non_blocking=True).contiguous()
-            #     if envs.VLLM_ZERO_OVERHEAD:
-            #         decode_block_table_tensor = torch.empty((self._num_decode_tokens, block_table_tensor.shape[1]), 
-            #                                                 device=block_table_tensor.device)
-            #         arange_np = np.arange(self._num_decodes)
-            #         indices_np = np.repeat(arange_np, query_lens)
-                    
-            #         indices = torch.from_numpy(indices_np).pin_memory().to(
-            #             block_table_tensor.device, non_blocking=True)
-            #         decode_block_table_tensor = block_table_tensor[indices].contiguous()
-            #         decode_seq_lens = seq_lens[indices].contiguous()
-            #     else:
-            #         decode_block_table_tensor = torch.repeat_interleave(
-            #             block_table_tensor[:self._num_decodes, ...],
-            #             repeats, dim=0).contiguous()
-            #         decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0).contiguous()
-            #     seq_lens_minus = torch.from_numpy(rarange).to(torch.int32).pin_memory().to(
-            #         seq_lens.device, non_blocking=True).contiguous()
-            #     decode_seq_lens = decode_seq_lens - seq_lens_minus
-
-            #     if self.spec_decode_block_table_tensor is not None:
-            #         self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(decode_block_table_tensor)
-            #         self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(decode_seq_lens)
-
-            #         decode_metadata = self._build_decode(
-            #             block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
-            #             seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
-            #         )
-            #     else:
-            #         decode_metadata = self._build_decode(
-            #             block_table_tensor=decode_block_table_tensor,
-            #             seq_lens=decode_seq_lens,
-            #         )
-            # else:
-            #     self._num_decode_tokens = num_decodes
-            #     if self.use_spec_decode and self.spec_decode_block_table_tensor is not None:
-            #         self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(block_table_tensor[:self._num_decode_tokens, ...])
-            #         self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(seq_lens[:self._num_decode_tokens])
-
-            #         decode_metadata = self._build_decode(
-            #             block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
-            #             seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
-            #         )
-            #     else:
-            #         decode_metadata = self._build_decode(
-            #             block_table_tensor=block_table_tensor[:self._num_decode_tokens, ...],
-            #             seq_lens=seq_lens[:self._num_decode_tokens],
-            #         )
+        #     if self.use_spec_decode and not common_attn_metadata.spec_layer_decoding:
+        #         query_lens = self.num_scheduled_tokens_np[:num_decodes]
+        #         cu_num_blocks = np.cumsum(query_lens)
+        #         virtual_batches = cu_num_blocks[-1]
+        #         block_offsets = np.repeat(cu_num_blocks - query_lens, query_lens)
+        #         arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
+        #         rarange = np.repeat(query_lens, query_lens) - arange - 1
+
+        #         repeats = torch.from_numpy(query_lens).pin_memory().to(
+        #             block_table_tensor.device, non_blocking=True).contiguous()
+               
+        #         decode_block_table_tensor = torch.repeat_interleave(
+        #             block_table_tensor[:self._num_decodes, ...],
+        #             repeats, dim=0).contiguous()
+        #         decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0).contiguous()
+        #         seq_lens_minus = torch.from_numpy(rarange).to(torch.int32).pin_memory().to(
+        #             seq_lens.device, non_blocking=True).contiguous()
+        #         decode_seq_lens = decode_seq_lens - seq_lens_minus
+
+        #         if self.spec_decode_block_table_tensor is not None:
+        #             self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(decode_block_table_tensor)
+        #             self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(decode_seq_lens)
+
+        #             decode_metadata = self._build_decode(
+        #                 block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
+        #                 seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
+        #             )
+        #         else:
+        #             decode_metadata = self._build_decode(
+        #                 block_table_tensor=decode_block_table_tensor,
+        #                 seq_lens=decode_seq_lens,
+        #             )
+        #     else:
+        #         self._num_decode_tokens = num_decodes
+        #         if self.use_spec_decode and self.spec_decode_block_table_tensor is not None:
+        #             self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(block_table_tensor[:self._num_decode_tokens, ...])
+        #             self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(seq_lens[:self._num_decode_tokens])
+
+        #             decode_metadata = self._build_decode(
+        #                 block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
+        #                 seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
+        #             )
+        #         else:
+        #             decode_metadata = self._build_decode(
+        #                 block_table_tensor=block_table_tensor[:self._num_decode_tokens, ...],
+        #                 seq_lens=seq_lens[:self._num_decode_tokens],
+        #             )

        decode_metadata = self._build_decode(
            block_table_tensor=block_table_tensor[:num_decodes, ...],

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1457,16 +1457,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # [0, 1, 2, 5, 6, 9]
        target_logits_indices += arange

-        # if envs.VLLM_ZERO_OVERHEAD:
-        #     cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).pin_memory().to(
-        #         self.device, non_blocking=True)
-        #     logits_indices = torch.from_numpy(logits_indices).pin_memory().to(self.device,
-        #                                                         non_blocking=True)
-        #     target_logits_indices = torch.from_numpy(target_logits_indices).pin_memory().to(
-        #         self.device, non_blocking=True)
-        #     bonus_logits_indices = torch.from_numpy(bonus_logits_indices).pin_memory().to(
-        #         self.device, non_blocking=True)
-        # else:
        # TODO: Optimize the CPU -> GPU copy.
        cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
            self.device, non_blocking=True)