Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5db8533c
Commit
5db8533c
authored
Oct 20, 2025
by
zhuwenwen
Browse files
remove tbo and zero_overhead envs
parent
573531eb
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
47 additions
and
87 deletions
+47
-87
vllm/envs.py
vllm/envs.py
+0
-20
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+47
-57
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+0
-10
No files found.
vllm/envs.py
View file @
5db8533c
...
...
@@ -222,10 +222,6 @@ if TYPE_CHECKING:
VLLM_HAS_CONTEXT_DEFAULT
:
bool
=
False
VLLM_USE_NN
:
bool
=
False
VLLM_ENABLE_TBO
:
bool
=
False
VLLM_TBO_REQ_DELAY_MS
:
int
=
0
VLLM_TBO_DECODE_BS
:
int
=
0
VLLM_TBO_MIN_TOKENS
:
int
=
200
VLLM_ZERO_OVERHEAD
:
bool
=
False
VLLM_ENABLE_MOE_FUSED_GATE
:
bool
=
False
VLLM_USE_FLASH_ATTN_PA
:
bool
=
False
VLLM_USE_APEX_RN
:
bool
=
False
...
...
@@ -1583,22 +1579,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Enable two batch overlap.
"VLLM_ENABLE_TBO"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_TBO"
,
"0"
))),
# Set a delay on the server when there is only one request; the purpose is to merge requests into a larger batch.
"VLLM_TBO_REQ_DELAY_MS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TBO_REQ_DELAY_MS"
,
"0"
)),
# Set the minimum batch size to enable TBO in decode; if < 2, TBO is disabled in decode.
"VLLM_TBO_DECODE_BS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TBO_DECODE_BS"
,
"0"
)),
# Set the minimum number of tokens for each mini-batch to enable TBO on v1; the default is 200.
"VLLM_TBO_MIN_TOKENS"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TBO_MIN_TOKENS"
,
"200"
)),
# Enable zero overhead scheduler.
"VLLM_ZERO_OVERHEAD"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ZERO_OVERHEAD"
,
"0"
))),
# If set, vLLM will enable the moe_fused_gate kernel.
"VLLM_ENABLE_MOE_FUSED_GATE"
:
...
...
vllm/v1/attention/backends/mla/common.py
View file @
5db8533c
...
...
@@ -876,63 +876,53 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
# TODO @ wangming
decode_metadata
=
None
# if num_decodes > 0:
# if self.use_spec_decode and not common_attn_metadata.spec_layer_decoding:
# query_lens = self.num_scheduled_tokens_np[:num_decodes]
# cu_num_blocks = np.cumsum(query_lens)
# virtual_batches = cu_num_blocks[-1]
# block_offsets = np.repeat(cu_num_blocks - query_lens, query_lens)
# arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
# rarange = np.repeat(query_lens, query_lens) - arange - 1
# repeats = torch.from_numpy(query_lens).pin_memory().to(
# block_table_tensor.device, non_blocking=True).contiguous()
# if envs.VLLM_ZERO_OVERHEAD:
# decode_block_table_tensor = torch.empty((self._num_decode_tokens, block_table_tensor.shape[1]),
# device=block_table_tensor.device)
# arange_np = np.arange(self._num_decodes)
# indices_np = np.repeat(arange_np, query_lens)
# indices = torch.from_numpy(indices_np).pin_memory().to(
# block_table_tensor.device, non_blocking=True)
# decode_block_table_tensor = block_table_tensor[indices].contiguous()
# decode_seq_lens = seq_lens[indices].contiguous()
# else:
# decode_block_table_tensor = torch.repeat_interleave(
# block_table_tensor[:self._num_decodes, ...],
# repeats, dim=0).contiguous()
# decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0).contiguous()
# seq_lens_minus = torch.from_numpy(rarange).to(torch.int32).pin_memory().to(
# seq_lens.device, non_blocking=True).contiguous()
# decode_seq_lens = decode_seq_lens - seq_lens_minus
# if self.spec_decode_block_table_tensor is not None:
# self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(decode_block_table_tensor)
# self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(decode_seq_lens)
# decode_metadata = self._build_decode(
# block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
# )
# else:
# decode_metadata = self._build_decode(
# block_table_tensor=decode_block_table_tensor,
# seq_lens=decode_seq_lens,
# )
# else:
# self._num_decode_tokens = num_decodes
# if self.use_spec_decode and self.spec_decode_block_table_tensor is not None:
# self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(block_table_tensor[:self._num_decode_tokens, ...])
# self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(seq_lens[:self._num_decode_tokens])
# decode_metadata = self._build_decode(
# block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
# )
# else:
# decode_metadata = self._build_decode(
# block_table_tensor=block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=seq_lens[:self._num_decode_tokens],
# )
# if self.use_spec_decode and not common_attn_metadata.spec_layer_decoding:
# query_lens = self.num_scheduled_tokens_np[:num_decodes]
# cu_num_blocks = np.cumsum(query_lens)
# virtual_batches = cu_num_blocks[-1]
# block_offsets = np.repeat(cu_num_blocks - query_lens, query_lens)
# arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
# rarange = np.repeat(query_lens, query_lens) - arange - 1
# repeats = torch.from_numpy(query_lens).pin_memory().to(
# block_table_tensor.device, non_blocking=True).contiguous()
# decode_block_table_tensor = torch.repeat_interleave(
# block_table_tensor[:self._num_decodes, ...],
# repeats, dim=0).contiguous()
# decode_seq_lens = torch.repeat_interleave(seq_lens[:self._num_decodes], repeats, dim=0).contiguous()
# seq_lens_minus = torch.from_numpy(rarange).to(torch.int32).pin_memory().to(
# seq_lens.device, non_blocking=True).contiguous()
# decode_seq_lens = decode_seq_lens - seq_lens_minus
# if self.spec_decode_block_table_tensor is not None:
# self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(decode_block_table_tensor)
# self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(decode_seq_lens)
# decode_metadata = self._build_decode(
# block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
# )
# else:
# decode_metadata = self._build_decode(
# block_table_tensor=decode_block_table_tensor,
# seq_lens=decode_seq_lens,
# )
# else:
# self._num_decode_tokens = num_decodes
# if self.use_spec_decode and self.spec_decode_block_table_tensor is not None:
# self.spec_decode_block_table_tensor[:self._num_decode_tokens].copy_(block_table_tensor[:self._num_decode_tokens, ...])
# self.spec_decode_seq_lens[:self._num_decode_tokens].copy_(seq_lens[:self._num_decode_tokens])
# decode_metadata = self._build_decode(
# block_table_tensor=self.spec_decode_block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=self.spec_decode_seq_lens[:self._num_decode_tokens],
# )
# else:
# decode_metadata = self._build_decode(
# block_table_tensor=block_table_tensor[:self._num_decode_tokens, ...],
# seq_lens=seq_lens[:self._num_decode_tokens],
# )
decode_metadata
=
self
.
_build_decode
(
block_table_tensor
=
block_table_tensor
[:
num_decodes
,
...],
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
5db8533c
...
...
@@ -1457,16 +1457,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# [0, 1, 2, 5, 6, 9]
target_logits_indices
+=
arange
# if envs.VLLM_ZERO_OVERHEAD:
# cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).pin_memory().to(
# self.device, non_blocking=True)
# logits_indices = torch.from_numpy(logits_indices).pin_memory().to(self.device,
# non_blocking=True)
# target_logits_indices = torch.from_numpy(target_logits_indices).pin_memory().to(
# self.device, non_blocking=True)
# bonus_logits_indices = torch.from_numpy(bonus_logits_indices).pin_memory().to(
# self.device, non_blocking=True)
# else:
# TODO: Optimize the CPU -> GPU copy.
cu_num_draft_tokens
=
torch
.
from_numpy
(
cu_num_draft_tokens
).
to
(
self
.
device
,
non_blocking
=
True
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment