"docs/getting_started/installation/cpu.md" did not exist on "32aa2059addd97be1afce7a199d228191710c294"
Commit 5db8533c authored by zhuwenwen
Browse files

remove tbo and zero_overhead envs

parent 573531eb
......@@ -222,10 +222,6 @@ if TYPE_CHECKING:
VLLM_HAS_CONTEXT_DEFAULT: bool = False
VLLM_USE_NN: bool = False
VLLM_ENABLE_TBO: bool = False
VLLM_TBO_REQ_DELAY_MS: int = 0
VLLM_TBO_DECODE_BS: int = 0
VLLM_TBO_MIN_TOKENS: int = 200
VLLM_ZERO_OVERHEAD: bool = False
VLLM_ENABLE_MOE_FUSED_GATE: bool = False
VLLM_USE_FLASH_ATTN_PA: bool = False
VLLM_USE_APEX_RN: bool = False
......@@ -1584,22 +1580,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_TBO":
lambda: bool(int(os.getenv("VLLM_ENABLE_TBO", "0"))),
# Set a delay on the server when there is only one request, so that more requests can be merged into a larger batch.
"VLLM_TBO_REQ_DELAY_MS":
lambda: int(os.getenv("VLLM_TBO_REQ_DELAY_MS", "0")),
# Set the minimum batch size required to enable TBO in decode; if it is < 2, TBO is disabled in decode.
"VLLM_TBO_DECODE_BS":
lambda: int(os.getenv("VLLM_TBO_DECODE_BS", "0")),
# Set the minimum number of tokens in each mini-batch required to enable TBO on v1; the default is 200.
"VLLM_TBO_MIN_TOKENS":
lambda: int(os.getenv("VLLM_TBO_MIN_TOKENS", "200")),
# Enable zero overhead scheduler.
"VLLM_ZERO_OVERHEAD":
lambda: bool(int(os.getenv("VLLM_ZERO_OVERHEAD", "0"))),
# If set, vLLM will enable the moe_fused_gate kernel.
"VLLM_ENABLE_MOE_FUSED_GATE":
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_FUSED_GATE", "1"))),
......
......@@ -886,17 +886,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
# repeats = torch.from_numpy(query_lens).pin_memory().to(
# block_table_tensor.device, non_blocking=True).contiguous()
# if envs.VLLM_ZERO_OVERHEAD:
# decode_block_table_tensor = torch.empty((self._num_decode_tokens, block_table_tensor.shape[1]),
# device=block_table_tensor.device)
# arange_np = np.arange(self._num_decodes)
# indices_np = np.repeat(arange_np, query_lens)
# indices = torch.from_numpy(indices_np).pin_memory().to(
# block_table_tensor.device, non_blocking=True)
# decode_block_table_tensor = block_table_tensor[indices].contiguous()
# decode_seq_lens = seq_lens[indices].contiguous()
# else:
# decode_block_table_tensor = torch.repeat_interleave(
# block_table_tensor[:self._num_decodes, ...],
# repeats, dim=0).contiguous()
......
......@@ -1457,16 +1457,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# [0, 1, 2, 5, 6, 9]
target_logits_indices += arange
# if envs.VLLM_ZERO_OVERHEAD:
# cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).pin_memory().to(
# self.device, non_blocking=True)
# logits_indices = torch.from_numpy(logits_indices).pin_memory().to(self.device,
# non_blocking=True)
# target_logits_indices = torch.from_numpy(target_logits_indices).pin_memory().to(
# self.device, non_blocking=True)
# bonus_logits_indices = torch.from_numpy(bonus_logits_indices).pin_memory().to(
# self.device, non_blocking=True)
# else:
# TODO: Optimize the CPU -> GPU copy.
cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
self.device, non_blocking=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment