Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c1c5e4f6
Commit
c1c5e4f6
authored
Dec 18, 2025
by
zhuwenwen
Browse files
remove unused code
parent
d2fe5111
Changes
17
Show whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
12 additions
and
1042 deletions
+12
-1042
vllm/attention/ops/common.py
vllm/attention/ops/common.py
+1
-206
vllm/attention/ops/paged_attn.py
vllm/attention/ops/paged_attn.py
+0
-6
vllm/compilation/decorators.py
vllm/compilation/decorators.py
+0
-1
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+0
-5
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+0
-2
vllm/lora/models.py
vllm/lora/models.py
+0
-3
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+0
-3
vllm/v1/attention/backends/utils.py
vllm/v1/attention/backends/utils.py
+1
-8
vllm/v1/attention/backends/xformers.py
vllm/v1/attention/backends/xformers.py
+0
-438
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+3
-10
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+0
-1
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+0
-3
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+0
-11
vllm/v1/spec_decode/utils.py
vllm/v1/spec_decode/utils.py
+0
-1
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+1
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+6
-23
vllm/worker/worker_base.py
vllm/worker/worker_base.py
+0
-319
No files found.
vllm/attention/ops/common.py
View file @
c1c5e4f6
...
...
@@ -467,208 +467,3 @@ def unpack_seq_triton(
out
=
out
.
reshape
(
output_shape
)
return
out
\ No newline at end of file
@
triton
.
jit
def
_pack_seq_kernel
(
x_ptr
,
# [N, D]
out_ptr
,
# [B, Lmax, D]
lengths_ptr
,
# *i32, [B]
N
:
tl
.
constexpr
,
D
:
tl
.
constexpr
,
Lmax
:
tl
.
constexpr
,
PAD_VALUE
:
tl
.
constexpr
,
BLOCK_T
:
tl
.
constexpr
,
# timesteps per program
BLOCK_D
:
tl
.
constexpr
# features per program
):
pid_b
=
tl
.
program_id
(
0
)
# batch id
pid_t
=
tl
.
program_id
(
1
)
# block over time dimension
pid_d
=
tl
.
program_id
(
2
)
# block over feature dimension
off_t
=
pid_t
*
BLOCK_T
+
tl
.
arange
(
0
,
BLOCK_T
)
# [BLOCK_T]
off_d
=
pid_d
*
BLOCK_D
+
tl
.
arange
(
0
,
BLOCK_D
)
# [BLOCK_D]
# Compute start index and sequence length from cumulative lengths
in_start
=
0
for
i
in
range
(
pid_b
):
in_start
+=
tl
.
load
(
lengths_ptr
+
i
)
seq_len
=
tl
.
load
(
lengths_ptr
+
pid_b
)
# valid time positions for this block
t_mask
=
off_t
<
Lmax
# compute input row indices for valid (b, t)
in_row
=
in_start
+
off_t
valid_row
=
(
off_t
<
seq_len
)
&
t_mask
# Pointers
# x_ptr: row-major [N, D]
x_row_ptr
=
x_ptr
+
in_row
[:,
None
]
*
D
+
off_d
[
None
,
:]
# out_ptr: row-major [B, Lmax, D]
out_row_ptr
=
out_ptr
+
(
pid_b
*
Lmax
+
off_t
)[:,
None
]
*
D
+
off_d
[
None
,
:]
# Initialize with PAD (cast will occur as needed based on out_ptr dtype)
d_mask
=
off_d
[
None
,
:]
<
D
pad_vals
=
tl
.
full
([
BLOCK_T
,
BLOCK_D
],
PAD_VALUE
,
tl
.
float32
)
tl
.
store
(
out_row_ptr
,
pad_vals
,
mask
=
t_mask
[:,
None
]
&
d_mask
)
# Load & write only where within seq_len
x_vals
=
tl
.
load
(
x_row_ptr
,
mask
=
valid_row
[:,
None
]
&
d_mask
)
tl
.
store
(
out_row_ptr
,
x_vals
,
mask
=
valid_row
[:,
None
]
&
d_mask
)
def
pack_seq_triton
(
x
:
torch
.
Tensor
,
lengths
:
torch
.
Tensor
,
pad_value
:
float
=
-
float
(
'inf'
),
block_t
:
int
=
64
,
block_d
:
int
=
64
)
->
torch
.
Tensor
:
"""
Pack sequences of different lengths into a batched tensor.
Args:
x: [N, ...] - input tensor where N is total number of tokens
lengths: [B] - sequence lengths for each batch
pad_value: value to use for padding
block_t: block size for time dimension
block_d: block size for feature dimension
Returns:
packed: [B, Lmax, ...] - packed tensor
"""
# Handle multi-dimensional input by reshaping to (N, -1)
original_shape
=
x
.
shape
if
len
(
original_shape
)
>
2
:
N
=
original_shape
[
0
]
x_reshaped
=
x
.
reshape
(
N
,
-
1
)
D
=
x_reshaped
.
shape
[
1
]
else
:
N
,
D
=
x
.
shape
x_reshaped
=
x
B
=
lengths
.
numel
()
Lmax
=
int
(
lengths
.
max
().
item
())
# Starts are computed inside the kernel from lengths
out
=
torch
.
empty
((
B
,
Lmax
,
D
),
device
=
x
.
device
,
dtype
=
x
.
dtype
)
grid
=
(
B
,
triton
.
cdiv
(
Lmax
,
block_t
),
triton
.
cdiv
(
D
,
block_d
))
_pack_seq_kernel
[
grid
](
x_reshaped
,
out
,
lengths
.
int
(),
N
,
D
,
Lmax
,
PAD_VALUE
=
float
(
pad_value
),
BLOCK_T
=
block_t
,
BLOCK_D
=
block_d
,
num_warps
=
4
,
num_stages
=
2
)
# Reshape output back to original dimensions (except first dimension)
if
len
(
original_shape
)
>
2
:
output_shape
=
(
B
,
Lmax
)
+
original_shape
[
1
:]
out
=
out
.
reshape
(
output_shape
)
return
out
@
triton
.
jit
def
_unpack_seq_triton_kernel
(
packed_ptr
,
# [B, Lmax, D]
out_ptr
,
# [N, D]
lengths_ptr
,
# *i32, [B]
B
:
tl
.
constexpr
,
Lmax
:
tl
.
constexpr
,
D
:
tl
.
constexpr
,
BLOCK_T
:
tl
.
constexpr
,
# timesteps per program
BLOCK_D
:
tl
.
constexpr
# features per program
):
pid_b
=
tl
.
program_id
(
0
)
# batch id
pid_t
=
tl
.
program_id
(
1
)
# block over time dimension
pid_d
=
tl
.
program_id
(
2
)
# block over feature dimension
off_t
=
pid_t
*
BLOCK_T
+
tl
.
arange
(
0
,
BLOCK_T
)
# [BLOCK_T]
off_d
=
pid_d
*
BLOCK_D
+
tl
.
arange
(
0
,
BLOCK_D
)
# [BLOCK_D]
# bounds: compute start from cumulative lengths
in_start
=
0
for
i
in
range
(
pid_b
):
in_start
+=
tl
.
load
(
lengths_ptr
+
i
)
seq_len
=
tl
.
load
(
lengths_ptr
+
pid_b
)
# valid time positions for this block
t_mask
=
off_t
<
Lmax
valid_row
=
(
off_t
<
seq_len
)
&
t_mask
# compute output row indices for valid (b, t)
out_row
=
in_start
+
off_t
# Pointers
# packed_ptr: row-major [B, Lmax, D]
packed_row_ptr
=
packed_ptr
+
(
pid_b
*
Lmax
+
off_t
)[:,
None
]
*
D
+
off_d
[
None
,
:]
# out_ptr: row-major [N, D]
out_row_ptr
=
out_ptr
+
out_row
[:,
None
]
*
D
+
off_d
[
None
,
:]
# Load from packed tensor and store to output
d_mask
=
off_d
[
None
,
:]
<
D
packed_vals
=
tl
.
load
(
packed_row_ptr
,
mask
=
valid_row
[:,
None
]
&
d_mask
)
tl
.
store
(
out_row_ptr
,
packed_vals
,
mask
=
valid_row
[:,
None
]
&
d_mask
)
def
unpack_seq_triton
(
packed_tensor
:
torch
.
Tensor
,
lengths
:
torch
.
Tensor
,
block_t
:
int
=
64
,
block_d
:
int
=
64
)
->
torch
.
Tensor
:
"""
Unpack a packed decode query tensor back to the original format.
Efficient Triton implementation.
Args:
packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton
lengths: [B] - sequence lengths for each batch
block_t: block size for time dimension
block_d: block size for feature dimension
Returns:
unpacked_tensor: [N, ...] where N = sum(lengths)
"""
# Handle multi-dimensional input by reshaping to (B, Lmax, -1)
original_shape
=
packed_tensor
.
shape
if
len
(
original_shape
)
>
3
:
B
,
Lmax
=
original_shape
[:
2
]
packed_reshaped
=
packed_tensor
.
reshape
(
B
,
Lmax
,
-
1
)
D
=
packed_reshaped
.
shape
[
2
]
else
:
B
,
Lmax
,
D
=
packed_tensor
.
shape
packed_reshaped
=
packed_tensor
# Calculate total number of elements
N
=
int
(
lengths
.
sum
().
item
())
out
=
torch
.
empty
((
N
,
D
),
device
=
packed_tensor
.
device
,
dtype
=
packed_tensor
.
dtype
)
grid
=
(
B
,
triton
.
cdiv
(
Lmax
,
block_t
),
triton
.
cdiv
(
D
,
block_d
))
_unpack_seq_triton_kernel
[
grid
](
packed_reshaped
,
out
,
lengths
.
int
(),
B
,
Lmax
,
D
,
BLOCK_T
=
block_t
,
BLOCK_D
=
block_d
,
num_warps
=
4
,
num_stages
=
2
)
# Reshape output back to original dimensions (except first dimension)
if
len
(
original_shape
)
>
3
:
output_shape
=
(
N
,
)
+
original_shape
[
2
:]
out
=
out
.
reshape
(
output_shape
)
return
out
vllm/attention/ops/paged_attn.py
View file @
c1c5e4f6
...
...
@@ -7,18 +7,12 @@ import torch
from
vllm.platforms
import
current_platform
from
vllm
import
envs
from
vllm.triton_utils
import
HAS_TRITON
if
current_platform
.
is_cuda_alike
():
from
vllm
import
_custom_ops
as
ops
elif
current_platform
.
is_xpu
():
from
vllm._ipex_ops
import
ipex_ops
as
ops
if
HAS_TRITON
:
from
vllm.attention.ops.prefix_prefill
import
context_attention_fwd
use_tc
=
envs
.
VLLM_USE_OPT_OP
and
envs
.
VLLM_USE_TC_PAGED_ATTN
class
PagedAttention
:
@
staticmethod
...
...
vllm/compilation/decorators.py
View file @
c1c5e4f6
...
...
@@ -31,7 +31,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from
vllm.utils.torch_utils
import
supports_dynamo
from
.monitor
import
start_monitoring_torch_compile
from
vllm.config
import
VllmConfig
from
vllm.forward_context
import
get_profilling
logger
=
init_logger
(
__name__
)
...
...
vllm/entrypoints/chat_utils.py
View file @
c1c5e4f6
...
...
@@ -1767,11 +1767,6 @@ def apply_hf_chat_template(
)
try
:
resolved_kwargs
=
resolve_chat_template_kwargs
(
tokenizer
=
tokenizer
,
chat_template
=
hf_chat_template
,
chat_template_kwargs
=
kwargs
,
)
return
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
# type: ignore[arg-type]
tools
=
tools
,
# type: ignore[arg-type]
...
...
vllm/entrypoints/llm.py
View file @
c1c5e4f6
...
...
@@ -80,8 +80,6 @@ from vllm.v1.engine import EngineCoreRequest
from
vllm.v1.engine.llm_engine
import
LLMEngine
from
vllm.v1.sample.logits_processor
import
LogitsProcessor
import
vllm.envs
as
envs
if
TYPE_CHECKING
:
from
vllm.v1.metrics.reader
import
Metric
...
...
vllm/lora/models.py
View file @
c1c5e4f6
...
...
@@ -332,9 +332,6 @@ class LoRAModelManager:
assert
self
.
supported_lora_modules
,
"No supported LoRA modules found in"
f
"
{
self
.
model
.
__class__
.
__name__
}
."
if
lora_config
.
lora_target_modules
is
not
None
:
self
.
supported_lora_modules
=
lora_config
.
lora_target_modules
self
.
packed_modules_mapping
=
process_packed_modules_mapping
(
self
.
model
)
# Used to indicate whether the model is a multimodal model
self
.
supports_mm
:
bool
=
(
...
...
vllm/v1/attention/backends/mla/common.py
View file @
c1c5e4f6
...
...
@@ -231,7 +231,6 @@ from vllm.v1.attention.backends.utils import (
split_decodes_and_prefills
,
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.v1.worker.block_table
import
BlockTable
class
QueryLenSupport
(
Enum
):
...
...
@@ -1460,8 +1459,6 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
v
=
v
,
return_lse
=
True
,
)
# Convert from (q_len, num_heads) to (num_heads, q_len)
return
attn_out
,
lse
.
transpose
(
0
,
1
).
contiguous
()
# Convert from (q_len, num_heads) to (num_heads, q_len)
return
attn_out
,
lse
.
transpose
(
0
,
1
).
contiguous
()
...
...
vllm/v1/attention/backends/utils.py
View file @
c1c5e4f6
...
...
@@ -69,10 +69,8 @@ class CommonAttentionMetadata:
seq_lens_cpu
:
torch
.
Tensor
"""(batch_size,), the length of each request including both computed tokens
and newly scheduled tokens"""
num_computed_tokens_cpu
:
torch
.
Tensor
"""(batch_size,), the number of computed tokens for each request"""
num_reqs
:
int
"""Number of requests"""
# TODO(lucas): rename to num_tokens since it may be padded and this is misleading
...
...
@@ -84,12 +82,7 @@ class CommonAttentionMetadata:
"""Longest context length in batch"""
block_table_tensor
:
torch
.
Tensor
num_speculative_tokens
:
int
=
0
"""Number of speculative tokens"""
slot_mapping
:
torch
.
Tensor
=
None
"""(batch_size, seq_len), slot mapping"""
spec_layer_decoding
:
bool
=
False
slot_mapping
:
torch
.
Tensor
causal
:
bool
=
True
...
...
vllm/v1/attention/backends/xformers.py
deleted
100644 → 0
View file @
d2fe5111
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Attention layer with XFormersAttention."""
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Optional
import
torch
from
vllm.attention.backends.abstract
import
(
AttentionBackend
,
AttentionImpl
,
AttentionMetadata
,
AttentionType
)
from
vllm.attention.ops.triton_unified_attention
import
unified_attention
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.utils
import
(
AttentionMetadataBuilder
,
CommonAttentionMetadata
,
reorder_batch_to_split_decodes_and_prefills
,
split_decodes_and_prefills
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
try
:
from
xformers
import
ops
as
xops
from
xformers.ops.fmha.attn_bias
import
(
AttentionBias
,
PagedBlockDiagonalCausalWithOffsetPaddedKeysMask
)
XFORMERS_AVAILABLE
=
True
except
ImportError
:
XFORMERS_AVAILABLE
=
False
if
TYPE_CHECKING
:
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.worker.gpu_input_batch
import
InputBatch
from
vllm
import
_custom_ops
as
ops
logger
=
init_logger
(
__name__
)
class
XFormersAttentionBackend
(
AttentionBackend
):
accept_output_buffer
:
bool
=
True
@
classmethod
def
get_supported_dtypes
(
cls
)
->
list
[
torch
.
dtype
]:
return
[
torch
.
float16
,
torch
.
bfloat16
]
@
classmethod
def
get_supported_head_sizes
(
cls
)
->
list
[
int
]:
return
[
32
,
40
,
48
,
56
,
64
,
72
,
80
,
88
,
96
,
104
,
112
,
120
,
128
,
136
,
144
,
152
,
160
,
168
,
176
,
184
,
192
,
200
,
208
,
216
,
224
,
232
,
240
,
248
,
256
,
]
@
classmethod
def
validate_head_size
(
cls
,
head_size
:
int
)
->
None
:
supported_head_sizes
=
cls
.
get_supported_head_sizes
()
if
head_size
not
in
supported_head_sizes
:
attn_type
=
cls
.
__name__
.
removesuffix
(
"Backend"
)
raise
ValueError
(
f
"Head size
{
head_size
}
is not supported by
{
attn_type
}
. "
f
"Supported head sizes are:
{
supported_head_sizes
}
. "
"Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes."
)
@
staticmethod
def
get_name
()
->
str
:
return
"XFORMERS"
@
staticmethod
def
get_impl_cls
()
->
type
[
"XFormersAttentionImpl"
]:
return
XFormersAttentionImpl
@
staticmethod
def
get_metadata_cls
()
->
type
[
"AttentionMetadata"
]:
return
XFormersAttentionMetadata
@
staticmethod
def
get_kv_cache_shape
(
num_blocks
:
int
,
block_size
:
int
,
num_kv_heads
:
int
,
head_size
:
int
,
cache_dtype_str
:
str
=
"auto"
,
)
->
tuple
[
int
,
...]:
if
block_size
%
16
!=
0
:
raise
ValueError
(
"Block size must be a multiple of 16."
)
return
(
2
,
num_blocks
,
block_size
,
num_kv_heads
,
head_size
)
@
staticmethod
def
get_builder_cls
()
->
type
[
"XFormersAttentionMetadataBuilder"
]:
return
XFormersAttentionMetadataBuilder
@
staticmethod
def
use_cascade_attention
(
*
args
,
**
kwargs
)
->
bool
:
return
False
@
dataclass
class
XFormersAttentionMetadata
:
num_actual_tokens
:
int
# Number of tokens excluding padding.
max_query_len
:
int
query_start_loc
:
torch
.
Tensor
max_seq_len
:
int
seq_lens
:
torch
.
Tensor
block_table
:
torch
.
Tensor
slot_mapping
:
torch
.
Tensor
num_prefill_tokens
:
int
=
0
num_decode_tokens
:
int
=
0
num_prefills
:
int
=
0
num_decodes
:
int
=
0
# Biases for different attention types.
attn_bias
:
Optional
[
"AttentionBias"
]
=
None
# Self-attention prefill/decode metadata cache
_cached_prefill_metadata
:
Optional
[
"XFormersAttentionMetadata"
]
=
None
_cached_decode_metadata
:
Optional
[
"XFormersAttentionMetadata"
]
=
None
@
property
def
prefill_metadata
(
self
)
->
Optional
[
"XFormersAttentionMetadata"
]:
if
self
.
num_prefills
==
0
:
return
None
if
self
.
_cached_prefill_metadata
is
not
None
:
# Recover cached prefill-phase attention
# metadata structure
return
self
.
_cached_prefill_metadata
q_start_loc
=
self
.
query_start_loc
[
self
.
num_decodes
:]
q_seqlens
=
torch
.
diff
(
q_start_loc
)
kv_seqlens
=
self
.
seq_lens
[
self
.
num_decodes
:]
# Construct & cache prefill-phase attention metadata structure
self
.
_cached_prefill_metadata
=
XFormersAttentionMetadata
(
num_actual_tokens
=
self
.
num_prefill_tokens
,
max_query_len
=
int
(
q_seqlens
.
max
().
item
()),
query_start_loc
=
q_start_loc
-
q_start_loc
[
0
],
max_seq_len
=
int
(
kv_seqlens
.
max
().
item
()),
seq_lens
=
kv_seqlens
,
block_table
=
self
.
block_table
[
self
.
num_decodes
:],
slot_mapping
=
self
.
slot_mapping
[
self
.
num_decode_tokens
:],
)
return
self
.
_cached_prefill_metadata
@
property
def
decode_metadata
(
self
)
->
Optional
[
"XFormersAttentionMetadata"
]:
if
self
.
num_decode_tokens
==
0
:
return
None
if
self
.
_cached_decode_metadata
is
not
None
:
# Recover cached decode-phase attention
# metadata structure
return
self
.
_cached_decode_metadata
q_start_loc
=
self
.
query_start_loc
q_seqlens
=
torch
.
diff
(
q_start_loc
)
decode_kv_seqlens
=
self
.
seq_lens
[:
self
.
num_decodes
]
# Construct & cache decode-phase attention metadata structure
self
.
_cached_decode_metadata
=
XFormersAttentionMetadata
(
num_actual_tokens
=
self
.
num_decode_tokens
,
max_query_len
=
int
(
q_seqlens
[:
self
.
num_decodes
].
max
().
item
()),
query_start_loc
=
q_start_loc
[:
self
.
num_decodes
+
1
],
max_seq_len
=
int
(
decode_kv_seqlens
.
max
().
item
()),
seq_lens
=
decode_kv_seqlens
,
block_table
=
self
.
block_table
[:
self
.
num_decodes
],
slot_mapping
=
self
.
slot_mapping
[:
self
.
num_decode_tokens
],
attn_bias
=
self
.
attn_bias
,
)
return
self
.
_cached_decode_metadata
class
XFormersAttentionMetadataBuilder
(
AttentionMetadataBuilder
[
XFormersAttentionMetadata
]):
reorder_batch_threshold
:
int
=
1
def
__init__
(
self
,
kv_cache_spec
:
AttentionSpec
,
layer_names
:
list
[
str
],
vllm_config
:
VllmConfig
,
device
:
torch
.
device
,
):
super
().
__init__
(
kv_cache_spec
,
layer_names
,
vllm_config
,
device
)
assert
XFORMERS_AVAILABLE
self
.
block_size
=
kv_cache_spec
.
block_size
self
.
_num_decodes
=
0
self
.
_num_decode_tokens
=
0
def
reorder_batch
(
self
,
input_batch
:
"InputBatch"
,
scheduler_output
:
"SchedulerOutput"
)
->
bool
:
return
reorder_batch_to_split_decodes_and_prefills
(
input_batch
,
scheduler_output
,
decode_threshold
=
self
.
reorder_batch_threshold
)
def
build
(
self
,
common_prefix_len
:
int
,
common_attn_metadata
:
CommonAttentionMetadata
,
fast_build
:
bool
=
False
,
)
->
XFormersAttentionMetadata
:
num_decodes
,
num_prefills
,
num_decode_tokens
,
num_prefill_tokens
=
(
split_decodes_and_prefills
(
common_attn_metadata
,
decode_threshold
=
self
.
reorder_batch_threshold
))
num_actual_tokens
=
common_attn_metadata
.
num_actual_tokens
q_start_loc
=
common_attn_metadata
.
query_start_loc
q_seqlens
=
torch
.
diff
(
q_start_loc
)
max_query_len
=
common_attn_metadata
.
max_query_len
kv_seqlens
=
common_attn_metadata
.
seq_lens
max_seq_len
=
common_attn_metadata
.
max_seq_len
block_table
=
common_attn_metadata
.
block_table_tensor
slot_mapping
=
common_attn_metadata
.
slot_mapping
bias
=
None
if
num_decodes
>
0
:
# Construct the decoder bias.
decode_q_seqlens
=
q_seqlens
[:
num_decodes
]
decode_kv_seqlens
=
kv_seqlens
[:
num_decodes
]
bias
=
(
PagedBlockDiagonalCausalWithOffsetPaddedKeysMask
.
from_seqlens
(
q_seqlen
=
decode_q_seqlens
.
tolist
(),
kv_seqlen
=
decode_kv_seqlens
.
tolist
(),
page_size
=
self
.
block_size
,
block_tables
=
block_table
[:
num_decodes
],
device
=
block_table
.
device
,
))
return
XFormersAttentionMetadata
(
num_actual_tokens
=
num_actual_tokens
,
num_prefill_tokens
=
num_prefill_tokens
,
num_decode_tokens
=
num_decode_tokens
,
num_prefills
=
num_prefills
,
num_decodes
=
num_decodes
,
max_query_len
=
max_query_len
,
query_start_loc
=
q_start_loc
,
max_seq_len
=
max_seq_len
,
seq_lens
=
kv_seqlens
,
block_table
=
block_table
,
slot_mapping
=
slot_mapping
,
attn_bias
=
bias
,
)
class
XFormersAttentionImpl
(
AttentionImpl
):
def
__init__
(
self
,
num_heads
:
int
,
head_size
:
int
,
scale
:
float
,
num_kv_heads
:
int
,
alibi_slopes
:
Optional
[
list
[
float
]],
sliding_window
:
Optional
[
int
],
kv_cache_dtype
:
str
,
logits_soft_cap
:
Optional
[
float
]
=
None
,
attn_type
:
AttentionType
=
AttentionType
.
DECODER
,
kv_sharing_target_layer_name
:
Optional
[
str
]
=
None
,
)
->
None
:
if
kv_sharing_target_layer_name
is
not
None
:
raise
NotImplementedError
(
"KV sharing is not supported in V0."
)
if
alibi_slopes
is
not
None
:
raise
NotImplementedError
(
"XFormers does not support alibi slopes yet."
)
self
.
num_heads
=
num_heads
self
.
head_size
=
head_size
self
.
scale
=
float
(
scale
)
self
.
num_kv_heads
=
num_kv_heads
self
.
num_queries_per_kv
=
self
.
num_heads
//
self
.
num_kv_heads
self
.
kv_cache_dtype
=
kv_cache_dtype
self
.
kv_sharing_target_layer_name
=
kv_sharing_target_layer_name
if
alibi_slopes
is
not
None
:
alibi_slopes
=
torch
.
tensor
(
alibi_slopes
,
dtype
=
torch
.
float32
)
self
.
alibi_slopes
=
alibi_slopes
if
sliding_window
is
None
:
self
.
sliding_window
=
(
-
1
,
-
1
)
else
:
self
.
sliding_window
=
(
sliding_window
-
1
,
0
)
if
logits_soft_cap
is
None
:
# Setting logits_soft_cap to 0 means no soft cap.
logits_soft_cap
=
0
self
.
logits_soft_cap
=
logits_soft_cap
XFormersAttentionBackend
.
validate_head_size
(
head_size
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
"encoder/decoder cross-attention "
"are not implemented for "
"XFormersAttentionImpl."
)
def
forward
(
self
,
layer
:
torch
.
nn
.
Module
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
,
value
:
torch
.
Tensor
,
kv_cache
:
torch
.
Tensor
,
attn_metadata
:
XFormersAttentionMetadata
,
output
:
Optional
[
torch
.
Tensor
]
=
None
,
output_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
output_block_scale
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
"""Forward pass with XFormers.
Args:
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
assert
output
is
not
None
,
"Output tensor must be provided."
if
output_scale
is
not
None
or
output_block_scale
is
not
None
:
raise
NotImplementedError
(
"fused output quantization is not yet supported"
" for XFormersAttentionImpl"
)
if
attn_metadata
is
None
:
# Profiling run.
return
output
# Cache the input KVs.
key_cache
,
value_cache
=
kv_cache
.
unbind
(
0
)
if
self
.
kv_sharing_target_layer_name
is
None
:
# Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer.
# NOTE(woosuk): Here, key and value are padded while slot_mapping is
# not padded. However, we don't need to do key[:num_actual_tokens]
# and value[:num_actual_tokens] because the reshape_and_cache_flash
# op uses the slot_mapping's shape to determine the number of
# actual tokens.
ops
.
reshape_and_cache_flash
(
key
,
value
,
key_cache
,
value_cache
,
attn_metadata
.
slot_mapping
,
self
.
kv_cache_dtype
,
layer
.
_k_scale
,
layer
.
_v_scale
,
)
num_actual_tokens
=
attn_metadata
.
num_actual_tokens
num_decode_tokens
=
attn_metadata
.
num_decode_tokens
if
prefill_meta
:
=
attn_metadata
.
prefill_metadata
:
descale_shape
=
(
prefill_meta
.
query_start_loc
.
shape
[
0
]
-
1
,
key
.
shape
[
1
])
unified_attention
(
q
=
query
[
num_decode_tokens
:
num_actual_tokens
],
k
=
key_cache
,
v
=
value_cache
,
out
=
output
[
num_decode_tokens
:
num_actual_tokens
],
cu_seqlens_q
=
prefill_meta
.
query_start_loc
,
max_seqlen_q
=
prefill_meta
.
max_query_len
,
seqused_k
=
prefill_meta
.
seq_lens
,
max_seqlen_k
=
prefill_meta
.
max_seq_len
,
softmax_scale
=
self
.
scale
,
causal
=
True
,
alibi_slopes
=
self
.
alibi_slopes
,
window_size
=
self
.
sliding_window
,
block_table
=
prefill_meta
.
block_table
,
softcap
=
self
.
logits_soft_cap
,
q_descale
=
None
,
# Not supported
k_descale
=
layer
.
_k_scale
.
expand
(
descale_shape
),
v_descale
=
layer
.
_v_scale
.
expand
(
descale_shape
),
)
if
decode_meta
:
=
attn_metadata
.
decode_metadata
:
# Query for decode. KV is not needed because it is already cached.
decode_query
=
query
[:
num_decode_tokens
]
# Reshape query to [1, B_T, G, H, D].
q
=
decode_query
.
view
(
1
,
-
1
,
self
.
num_kv_heads
,
self
.
num_queries_per_kv
,
self
.
head_size
)
# Reshape the k and v caches to [1, Bkv_T, G, H, D]
cache_k
=
key_cache
.
view
(
1
,
-
1
,
self
.
num_kv_heads
,
1
,
self
.
head_size
).
expand
(
1
,
-
1
,
self
.
num_kv_heads
,
self
.
num_queries_per_kv
,
self
.
head_size
,
)
cache_v
=
value_cache
.
view
(
1
,
-
1
,
self
.
num_kv_heads
,
1
,
self
.
head_size
).
expand
(
1
,
-
1
,
self
.
num_kv_heads
,
self
.
num_queries_per_kv
,
self
.
head_size
,
)
attn_bias
=
decode_meta
.
attn_bias
output
[:
num_decode_tokens
]
=
xops
.
memory_efficient_attention_forward
(
q
,
cache_k
,
cache_v
,
attn_bias
=
attn_bias
,
p
=
0.0
,
scale
=
self
.
scale
,
).
view
(
decode_query
.
shape
)
# Reshape the output tensor.
return
output
vllm/v1/core/sched/scheduler.py
View file @
c1c5e4f6
...
...
@@ -178,7 +178,6 @@ class Scheduler(SchedulerInterface):
self
.
encoder_cache_manager
=
EncoderCacheManager
(
cache_size
=
encoder_cache_size
)
speculative_config
=
vllm_config
.
speculative_config
self
.
use_eagle
=
False
self
.
num_spec_tokens
=
self
.
num_lookahead_tokens
=
0
if
speculative_config
:
...
...
@@ -187,10 +186,6 @@ class Scheduler(SchedulerInterface):
self
.
use_eagle
=
True
self
.
num_lookahead_tokens
=
self
.
num_spec_tokens
self
.
compilation_config
=
vllm_config
.
compilation_config
self
.
full_cuda_graph
=
self
.
compilation_config
.
full_cuda_graph
self
.
use_mla
=
vllm_config
.
model_config
.
use_mla
# Create the KV cache manager.
self
.
kv_cache_manager
=
KVCacheManager
(
kv_cache_config
=
kv_cache_config
,
...
...
@@ -207,7 +202,7 @@ class Scheduler(SchedulerInterface):
self
.
use_pp
=
self
.
parallel_config
.
pipeline_parallel_size
>
1
self
.
use_v2_model_runner
=
envs
.
VLLM_USE_V2_MODEL_RUNNER
def
schedule
_default
(
self
)
->
SchedulerOutput
:
def
schedule
(
self
)
->
SchedulerOutput
:
# NOTE(woosuk) on the scheduling algorithm:
# There's no "decoding phase" nor "prefill phase" in the scheduler.
# Each request just has the num_computed_tokens and
...
...
@@ -423,9 +418,7 @@ class Scheduler(SchedulerInterface):
break
request
=
self
.
waiting
.
peek_request
()
if
request
.
is_finished
():
self
.
waiting
.
pop_request
()
continue
# KVTransfer: skip request if still waiting for remote kvs.
if
request
.
status
==
RequestStatus
.
WAITING_FOR_REMOTE_KVS
:
is_ready
=
self
.
_update_waiting_for_remote_kv
(
request
)
...
...
vllm/v1/engine/core.py
View file @
c1c5e4f6
...
...
@@ -14,7 +14,6 @@ from logging import DEBUG
from
typing
import
Any
,
TypeVar
,
cast
import
msgspec
from
vllm
import
envs
import
zmq
from
vllm.config
import
ParallelConfig
,
VllmConfig
...
...
vllm/v1/engine/llm_engine.py
View file @
c1c5e4f6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
time
from
collections.abc
import
Callable
,
Mapping
from
copy
import
copy
...
...
@@ -137,8 +136,6 @@ class LLMEngine:
# Don't keep the dummy data in memory
self
.
reset_mm_cache
()
# self.tree_decoding = os.environ.get('VLLM_TREE_DECODING') == '1'
@
property
@
deprecated
(
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
...
...
vllm/v1/spec_decode/eagle.py
View file @
c1c5e4f6
...
...
@@ -25,7 +25,6 @@ from vllm.model_executor.model_loader import get_model
from
vllm.model_executor.models
import
supports_multimodal
from
vllm.model_executor.models.deepseek_v2
import
DeepseekV32IndexerCache
from
vllm.model_executor.models.llama_eagle3
import
Eagle3LlamaForCausalLM
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
triton
...
...
@@ -51,7 +50,6 @@ from vllm.v1.spec_decode.utils import (
from
vllm.v1.utils
import
CpuGpuBuffer
from
vllm.v1.worker.dp_utils
import
coordinate_batch_across_dp
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
from
vllm.v1.worker.ubatching
import
dbo_current_ubatch_id
logger
=
init_logger
(
__name__
)
...
...
@@ -254,8 +252,6 @@ class EagleProposer:
# E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
self
.
input_ids
[
last_token_indices
]
=
next_token_ids
seq_lens
=
(
target_positions
[
last_token_indices
]
+
1
).
int
()
assert
self
.
runner
is
not
None
if
self
.
attn_metadata_builder
is
None
:
...
...
@@ -336,7 +332,6 @@ class EagleProposer:
hidden_states
=
self
.
hidden_states
[:
num_input_tokens
],
inputs_embeds
=
inputs_embeds
,
)
if
self
.
method
==
"mtp"
:
last_hidden_states
=
ret_hidden_states
hidden_states
=
last_hidden_states
...
...
@@ -391,11 +386,6 @@ class EagleProposer:
# Generate the remaining draft tokens.
draft_token_ids_list
=
[
draft_token_ids
]
if
self
.
method
==
"deepseek_mtp"
:
hidden_states
=
last_hidden_states
[
last_token_indices
]
else
:
hidden_states
=
hidden_states
[
last_token_indices
]
batch_size_dp_padded
,
batch_size_across_dp
=
self
.
_pad_batch_across_dp
(
num_tokens_unpadded
=
batch_size
,
num_tokens_padded
=
batch_size
,
...
...
@@ -534,7 +524,6 @@ class EagleProposer:
# [batch_size, num_speculative_tokens]
draft_token_ids
=
torch
.
stack
(
draft_token_ids_list
,
dim
=
1
)
return
draft_token_ids
def
prepare_next_token_ids_cpu
(
...
...
vllm/v1/spec_decode/utils.py
View file @
c1c5e4f6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.sampling_params
import
SamplingParams
from
vllm.triton_utils
import
tl
,
triton
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
c1c5e4f6
...
...
@@ -39,7 +39,6 @@ class CachedRequestState:
block_ids
:
tuple
[
list
[
int
],
...]
num_computed_tokens
:
int
output_token_ids
:
list
[
int
]
spec_token_ids
:
list
[
int
]
=
None
mrope_positions
:
torch
.
Tensor
|
None
=
None
mrope_position_delta
:
int
|
None
=
None
...
...
@@ -335,7 +334,7 @@ class InputBatch:
self
.
is_token_ids
[
req_index
,
start_idx
:
end_idx
]
=
True
# Number of token ids in prompt (token_ids_cpu or prompt_embeds).
# NOTE(woosuk): This may include spec decode tokens.
self
.
num_tokens
[
req_index
]
=
request
.
num_tokens
+
num_spec_tokens
self
.
num_tokens
[
req_index
]
=
request
.
num_tokens
# Number of tokens without spec decode tokens.
self
.
num_tokens_no_spec
[
req_index
]
=
request
.
num_tokens
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
c1c5e4f6
...
...
@@ -48,7 +48,7 @@ from vllm.distributed.parallel_state import (
is_global_first_rank
,
prepare_communication_buffer_for_model
,
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
,
set_profilling
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.attention_layer_base
import
AttentionLayerBase
from
vllm.model_executor.layers.rotary_embedding
import
(
...
...
@@ -888,8 +888,6 @@ class GPUModelRunner(
# Update the cached states.
req_state
.
num_computed_tokens
=
num_computed_tokens
spec_token_ids
=
(
scheduler_output
.
scheduled_spec_decode_tokens
.
get
(
req_id
,
()))
if
not
is_last_rank
:
# When using PP, the scheduler sends the sampled tokens back,
...
...
@@ -955,7 +953,7 @@ class GPUModelRunner(
if
not
is_last_rank
:
# Add new_token_ids to token_ids_cpu.
start_token_index
=
num_computed_tokens
end_token_index
=
num_computed_tokens
+
1
end_token_index
=
num_computed_tokens
+
len
(
new_token_ids
)
self
.
input_batch
.
token_ids_cpu
[
req_index
,
start_token_index
:
end_token_index
]
=
new_token_ids
...
...
@@ -2004,7 +2002,6 @@ class GPUModelRunner(
self
.
device
,
non_blocking
=
True
)
# Compute the draft token ids.
# draft_token_indices: [ 1, 2, 3, 105, 106, 208]
draft_token_ids
=
self
.
input_ids
.
gpu
[
logits_indices
]
...
...
@@ -3930,10 +3927,6 @@ class GPUModelRunner(
else
:
num_reqs
=
min
(
num_tokens
,
max_num_reqs
)
min_tokens_per_req
=
num_tokens
//
num_reqs
if
not
is_profile
and
self
.
speculative_config
is
not
None
and
self
.
speculative_config
.
num_lookahead_slots
>
0
:
min_tokens_per_req
=
(
1
+
self
.
speculative_config
.
num_lookahead_slots
)
num_reqs
=
num_tokens
//
min_tokens_per_req
num_scheduled_tokens_list
=
[
min_tokens_per_req
]
*
num_reqs
num_scheduled_tokens_list
[
-
1
]
+=
num_tokens
%
num_reqs
...
...
@@ -3995,8 +3988,6 @@ class GPUModelRunner(
self
.
seq_lens
.
np
[
num_reqs
:]
=
0
self
.
seq_lens
.
copy_to_gpu
()
num_speculative_tokens
=
0
if
self
.
speculative_config
is
None
else
self
.
speculative_config
.
num_lookahead_slots
cum_num_tokens
,
_
=
self
.
_get_cumsum_and_arange
(
num_scheduled_tokens
)
self
.
query_start_loc
.
np
[
1
:
num_reqs
+
1
]
=
cum_num_tokens
self
.
query_start_loc
.
copy_to_gpu
()
...
...
@@ -4090,10 +4081,8 @@ class GPUModelRunner(
else
:
hidden_states
=
outputs
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
()
and
not
is_profile
:
# assert isinstance(self.drafter, EagleProposer)
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
dummy_run
(
num_tokens
,
attn_metadata
)
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
():
assert
isinstance
(
self
.
drafter
,
EagleProposer
)
use_cudagraphs
=
(
cudagraph_runtime_mode
.
has_mode
(
CUDAGraphMode
.
PIECEWISE
)
and
not
self
.
speculative_config
.
enforce_eager
...
...
@@ -4293,10 +4282,6 @@ class GPUModelRunner(
return
self
.
_dummy_pooler_run_task
(
hidden_states
,
max_task
)
def
profile_run
(
self
)
->
None
:
# set profiling flag to avoid torch compile
# set_profilling(True)
# self._sync_device()
# Profile with multimodal encoder & encoder cache.
if
self
.
supports_mm_inputs
:
mm_config
=
self
.
model_config
.
multimodal_config
...
...
@@ -4383,7 +4368,6 @@ class GPUModelRunner(
del
hidden_states
,
output
self
.
encoder_cache
.
clear
()
gc
.
collect
()
# set_profilling(False)
def
capture_model
(
self
)
->
int
:
if
self
.
compilation_config
.
cudagraph_mode
==
CUDAGraphMode
.
NONE
:
...
...
@@ -5324,10 +5308,9 @@ class GPUModelRunner(
)
if
self
.
speculative_config
and
self
.
speculative_config
.
use_eagle
():
#
assert isinstance(self.drafter, EagleProposer)
assert
isinstance
(
self
.
drafter
,
EagleProposer
)
# validate all draft model layers belong to the same kv cache
# group
if
hasattr
(
self
,
'drafter'
)
and
isinstance
(
self
.
drafter
,
EagleProposer
):
self
.
drafter
.
validate_same_kv_cache_group
(
kv_cache_config
)
if
has_kv_transfer_group
():
...
...
vllm/worker/worker_base.py
deleted
100644 → 0
View file @
d2fe5111
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
numa
import
time
from
abc
import
abstractmethod
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Set
,
Tuple
,
TypeVar
,
Union
,
Type
)
import
cloudpickle
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
,
set_current_vllm_config
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
(
enable_trace_function_call_for_thread
,
resolve_obj_by_qualname
,
run_method
,
update_environment_variables
,
warn_for_unimplemented_methods
)
from
vllm.v1.outputs
import
SamplerOutput
# Module-level logger, created via init_logger with this module's name.
logger = init_logger(__name__)

# Type variable for the return type of the callable given to
# WorkerBase.apply_model.
_R = TypeVar("_R")
# Bind the current process to a NUMA node.
def bind_to_numa(local_rank):
    """Bind the current process to the NUMA node configured for a rank.

    The node index is read from the environment variable
    ``VLLM_RANK{local_rank}_NUMA``. When the variable is unset, negative, or
    not an integer, a warning is logged and no binding is performed.

    Args:
        local_rank: Rank used to build the environment variable name.

    Raises:
        ValueError: If the configured node index is greater than
            ``numa.get_max_node()``.
    """
    env_str = f"VLLM_RANK{local_rank}_NUMA"
    # Queried once and reused for the range check below.
    max_node = numa.get_max_node()

    raw_value = os.getenv(env_str)
    try:
        numa_node = -1 if raw_value is None else int(raw_value)
    except ValueError:
        # A non-integer value is a misconfiguration: skip binding instead of
        # crashing with an opaque int() conversion error.
        logger.warning(
            "%s is unset or set incorrectly, vllm will not bind to numa! "
            "%s = %s", env_str, env_str, raw_value)
        return

    # Unset or invalid configuration means no binding.
    # TODO: bind automatically based on the hardware topology.
    if numa_node < 0:
        logger.warning(
            "%s is unset or set incorrectly, vllm will not bind to numa! %s = %d",
            env_str, env_str, numa_node)
        return

    if numa_node > max_node:
        raise ValueError(f"NUMA node {numa_node} is not available.")

    numa.bind([numa_node])
@warn_for_unimplemented_methods
class WorkerBase:
    """Common worker interface shared by hardware-specific implementations.

    Subclasses override the methods that raise ``NotImplementedError``. The
    base class only stores the configuration objects and offers a few small
    helpers built on top of the overridable methods.
    """

    # TODO
    tree_decoding = os.getenv('VLLM_TREE_DECODING') == '1'

    def __init__(self, vllm_config: VllmConfig) -> None:
        """Store ``vllm_config`` and each of its sub-configs on the worker."""
        cfg = vllm_config
        self.vllm_config = cfg
        self.model_config = cfg.model_config
        self.cache_config = cfg.cache_config
        self.lora_config = cfg.lora_config
        self.load_config = cfg.load_config
        self.parallel_config = cfg.parallel_config
        self.scheduler_config = cfg.scheduler_config
        self.device_config = cfg.device_config
        self.speculative_config = cfg.speculative_config
        self.observability_config = cfg.observability_config
        self.kv_transfer_config = cfg.kv_transfer_config
        self.compilation_config = cfg.compilation_config

        # Imported here rather than at module level.
        from vllm.platforms import current_platform
        self.current_platform = current_platform

    def init_device(self) -> None:
        """Set up device state (model loading, on-device allocations)."""
        raise NotImplementedError

    def initialize_cache(self, num_gpu_blocks: int,
                         num_cpu_blocks: int) -> None:
        """Create the KV cache with the given block counts."""
        raise NotImplementedError

    def get_model(self) -> nn.Module:
        """Return the model held by this worker."""
        raise NotImplementedError

    def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
        """Call ``fn`` with this worker's model and return its result."""
        model = self.get_model()
        return fn(model)

    def load_model(self) -> None:
        """Load the model onto the target device."""
        raise NotImplementedError

    def execute_model(
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> Optional[List[SamplerOutput]]:
        """Run the model for one request; overridden by subclasses."""
        raise NotImplementedError

    def start_worker_execution_loop(self) -> None:
        """Call ``execute_model`` repeatedly until it returns ``None``.

        The loop runs inside the platform's inference-mode context. Each
        iteration passes ``execute_model_req=None``.
        """
        with self.current_platform.inference_mode():
            while self.execute_model(execute_model_req=None) is not None:
                pass
        return None

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Report how many KV-cache blocks are available.

        Returns:
            A ``(num_gpu_blocks, num_cpu_blocks)`` tuple: blocks usable on
            the device, and swapped blocks held in CPU memory.
        """
        raise NotImplementedError

    def get_cache_block_size_bytes(self) -> int:
        """Return the size in bytes of one cache block."""
        raise NotImplementedError

    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Add the LoRA described by ``lora_request``."""
        raise NotImplementedError

    def remove_lora(self, lora_id: int) -> bool:
        """Remove the LoRA identified by ``lora_id``."""
        raise NotImplementedError

    def pin_lora(self, lora_id: int) -> bool:
        """Pin the LoRA identified by ``lora_id``."""
        raise NotImplementedError

    def list_loras(self) -> Set[int]:
        """Return the set of LoRA ids."""
        raise NotImplementedError

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by the model configuration."""
        return self.model_config.get_vocab_size()

    def shutdown(self) -> None:
        """Release worker resources; the base implementation does nothing."""
        return None
class WorkerWrapperBase:
    """
    This class represents one process in an executor/engine. It is responsible
    for lazily initializing the worker and handling the worker's lifecycle.
    We first instantiate the WorkerWrapper, which remembers the worker module
    and class name. Then, when we call `update_environment_variables`, and the
    real initialization happens in `init_worker`.

    Attribute lookups that are not resolved on the wrapper itself are
    forwarded to the wrapped worker (see `__getattr__`).
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        rpc_rank: int = 0,
    ) -> None:
        """
        Initialize the worker wrapper with the given vllm_config and rpc_rank.
        Note: rpc_rank is the rank of the worker in the executor. In most cases,
        it is also the rank of the worker in the distributed group. However,
        when multiple executors work together, they can be different.
        e.g. in the case of SPMD-style offline inference with TP=2,
        users can launch 2 engines/executors, each with only 1 worker.
        All workers have rpc_rank=0, but they have different ranks in the TP
        group.
        """
        self.rpc_rank = rpc_rank
        self.worker: Optional[WorkerBase] = None
        self.vllm_config: Optional[VllmConfig] = None
        # do not store this `vllm_config`, `init_worker` will set the final
        # one. TODO: investigate if we can remove this field in
        # `WorkerWrapperBase`, `init_cached_hf_modules` should be
        # unnecessary now.
        if vllm_config.model_config is not None:
            # it can be None in tests
            trust_remote_code = vllm_config.model_config.trust_remote_code
            if trust_remote_code:
                # note: lazy import to avoid importing torch before initializing
                from vllm.utils import init_cached_hf_modules
                init_cached_hf_modules()

    def shutdown(self) -> None:
        """Shut down the wrapped worker, if one has been created."""
        if self.worker is not None:
            self.worker.shutdown()

    def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
        """
        Adjust the rpc_rank based on the given mapping.
        It is only used during the initialization of the executor,
        to adjust the rpc_rank of workers after we create all workers.
        """
        if self.rpc_rank in rank_mapping:
            self.rpc_rank = rank_mapping[self.rpc_rank]

    def update_environment_variables(self, envs_list: List[Dict[str,
                                                                str]]) -> None:
        """Apply the environment entry of `envs_list` at index `rpc_rank`."""
        envs = envs_list[self.rpc_rank]
        key = 'CUDA_VISIBLE_DEVICES'
        if key in envs and key in os.environ:
            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
            # suppress the warning in `update_environment_variables`
            del os.environ[key]
        update_environment_variables(envs)

    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
        """
        Here we inject some common logic before initializing the worker.
        Arguments are passed to the worker class constructor.

        The kwargs used are `all_kwargs[self.rpc_rank]`; they must contain
        `vllm_config`, and `local_rank` when NUMA binding is enabled.
        """
        kwargs = all_kwargs[self.rpc_rank]
        self.vllm_config = kwargs.get("vllm_config")
        assert self.vllm_config is not None, (
            "vllm_config is required to initialize the worker")
        enable_trace_function_call_for_thread(self.vllm_config)

        from vllm.plugins import load_general_plugins
        load_general_plugins()

        parallel_config = self.vllm_config.parallel_config
        if isinstance(parallel_config.worker_cls, str):
            worker_class = resolve_obj_by_qualname(parallel_config.worker_cls)
        else:
            logger.warning(
                "passing worker_cls as a class object is strongly deprecated,"
                " as the serialization of class objects can be tricky and"
                " error-prone. To be safe, please keep the class in a separate"
                " module and pass the qualified name of the class as a string."
            )
            assert isinstance(parallel_config.worker_cls, bytes)
            # NOTE(review): cloudpickle.loads executes arbitrary code from the
            # payload; this is only safe if worker_cls comes from a trusted
            # configuration source -- confirm against callers.
            worker_class = cloudpickle.loads(parallel_config.worker_cls)

        if parallel_config.worker_extension_cls:
            worker_extension_cls = resolve_obj_by_qualname(
                parallel_config.worker_extension_cls)
            extended_calls = []
            if worker_extension_cls not in worker_class.__bases__:
                # check any conflicts between worker and worker_extension_cls
                for attr in dir(worker_extension_cls):
                    if attr.startswith("__"):
                        continue
                    assert not hasattr(worker_class, attr), (
                        f"Worker class {worker_class} already has an attribute"
                        f" {attr}, which conflicts with the worker"
                        f" extension class {worker_extension_cls}.")
                    if callable(getattr(worker_extension_cls, attr)):
                        extended_calls.append(attr)
                # dynamically inherit the worker extension class
                worker_class.__bases__ = worker_class.__bases__ + (
                    worker_extension_cls, )
                logger.info(
                    "Injected %s into %s for extended collective_rpc calls %s",
                    worker_extension_cls, worker_class, extended_calls)

        with set_current_vllm_config(self.vllm_config):
            # To make vLLM config available during worker initialization
            self.worker = worker_class(**kwargs)
            assert self.worker is not None

        # NUMA binding is on by default; VLLM_NUMA_BIND=0 disables it.
        VLLM_NUMA_BIND = int(os.getenv("VLLM_NUMA_BIND", 1))
        if VLLM_NUMA_BIND > 0:
            # Bind the current process to the configured NUMA node.
            bind_to_numa(kwargs['local_rank'])
            # NOTE(review): the original nesting of the two log calls below
            # (inside vs. after this `if`) could not be recovered -- confirm.
            pid = os.getpid()
            logger.info(
                "########## %d process(rank%s) is running on CPU(s): %s", pid,
                str(kwargs['local_rank']), str(os.sched_getaffinity(pid)))
            logger.info(
                "########## %d process(rank%s) is running on memnode(s): %s",
                pid, str(kwargs['local_rank']), str(numa.get_membind()))

    def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
        """Forward the KV cache config at index `rpc_rank` to the worker."""
        kv_cache_config = kv_cache_configs[self.rpc_rank]
        with set_current_vllm_config(self.vllm_config):
            self.worker.initialize_from_config(  # type: ignore
                kv_cache_config)

    def init_device(self):
        """Call the worker's `init_device` with the vLLM config set."""
        with set_current_vllm_config(self.vllm_config):
            # To make vLLM config available during device initialization
            self.worker.init_device()  # type: ignore

    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
        """Run `method` via `run_method`, logging any exception it raises.

        The exception is logged and then re-raised unchanged.
        """
        try:
            # method resolution order:
            # if a method is defined in this class, it will be called directly.
            # otherwise, since we define `__getattr__` and redirect attribute
            # query to `self.worker`, the method will be called on the worker.
            return run_method(self, method, args, kwargs)
        except Exception:
            # if the driver worker also execute methods,
            # exceptions in the rest worker may cause deadlock in rpc like ray
            # see https://github.com/vllm-project/vllm/issues/3455
            # print the error and inform the user to solve the error
            msg = (f"Error executing method {method!r}. "
                   "This might cause deadlock in distributed execution.")
            logger.exception(msg)
            raise

    def __getattr__(self, attr):
        """Forward unresolved attribute lookups to the wrapped worker.

        Raises:
            AttributeError: If `worker` itself is missing from the instance
                (e.g. an object created without running `__init__`), or if
                the worker does not have `attr`.
        """
        # Without this guard, looking up `self.worker` on an instance that
        # lacks it re-enters __getattr__ forever and ends in RecursionError.
        if attr == "worker":
            raise AttributeError(attr)
        return getattr(self.worker, attr)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment