Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
de5774fa
Commit
de5774fa
authored
Dec 19, 2025
by
zhuwenwen
Browse files
remove unused code
parent
c1c5e4f6
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
9 additions
and
254 deletions
+9
-254
vllm/model_executor/layers/mla.py
vllm/model_executor/layers/mla.py
+0
-4
vllm/model_executor/layers/rotary_embedding/common.py
vllm/model_executor/layers/rotary_embedding/common.py
+1
-0
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/vocab_parallel_embedding.py
+1
-2
vllm/platforms/__init__.py
vllm/platforms/__init__.py
+0
-1
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+6
-1
vllm/transformers_utils/configs/__init__.py
vllm/transformers_utils/configs/__init__.py
+1
-2
vllm/transformers_utils/configs/deepseek_v3.py
vllm/transformers_utils/configs/deepseek_v3.py
+0
-101
vllm/transformers_utils/configs/medusa.py
vllm/transformers_utils/configs/medusa.py
+0
-8
vllm/transformers_utils/tokenizers/__init__.py
vllm/transformers_utils/tokenizers/__init__.py
+0
-4
vllm/utils/__init__.py
vllm/utils/__init__.py
+0
-31
vllm/utils/deep_gemm.py
vllm/utils/deep_gemm.py
+0
-100
No files found.
vllm/model_executor/layers/mla.py
View file @
de5774fa
...
@@ -163,10 +163,6 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
...
@@ -163,10 +163,6 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
if
llama_4_scaling
is
not
None
:
if
llama_4_scaling
is
not
None
:
q
*=
llama_4_scaling
q
*=
llama_4_scaling
if
self
.
indexer
and
self
.
is_sparse
:
_topk_indices
=
self
.
indexer
(
hidden_states
,
q_c
,
positions
,
self
.
rotary_emb
)
attn_out
=
self
.
mla_attn
(
attn_out
=
self
.
mla_attn
(
q
,
q
,
kv_c_normed
,
kv_c_normed
,
...
...
vllm/model_executor/layers/rotary_embedding/common.py
View file @
de5774fa
...
@@ -20,6 +20,7 @@ if current_platform.is_rocm():
...
@@ -20,6 +20,7 @@ if current_platform.is_rocm():
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
# common functions
# common functions
def
rotate_neox
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
rotate_neox
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
x1
=
x
[...,
:
x
.
shape
[
-
1
]
//
2
]
x1
=
x
[...,
:
x
.
shape
[
-
1
]
//
2
]
...
...
vllm/model_executor/layers/vocab_parallel_embedding.py
View file @
de5774fa
...
@@ -27,10 +27,8 @@ from vllm.model_executor.layers.quantization.base_config import (
...
@@ -27,10 +27,8 @@ from vllm.model_executor.layers.quantization.base_config import (
from
vllm.model_executor.layers.utils
import
dispatch_unquantized_gemm
from
vllm.model_executor.layers.utils
import
dispatch_unquantized_gemm
from
vllm.model_executor.parameter
import
BasevLLMParameter
from
vllm.model_executor.parameter
import
BasevLLMParameter
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
DEFAULT_VOCAB_PADDING_SIZE
=
64
DEFAULT_VOCAB_PADDING_SIZE
=
64
...
@@ -211,6 +209,7 @@ def get_masked_input_and_mask(
...
@@ -211,6 +209,7 @@ def get_masked_input_and_mask(
input_
=
vocab_mask
*
(
input_
-
valid_offset
)
input_
=
vocab_mask
*
(
input_
-
valid_offset
)
return
input_
,
~
vocab_mask
return
input_
,
~
vocab_mask
@
CustomOp
.
register
(
"vocab_parallel_embedding"
)
@
CustomOp
.
register
(
"vocab_parallel_embedding"
)
class
VocabParallelEmbedding
(
CustomOp
):
class
VocabParallelEmbedding
(
CustomOp
):
"""Embedding parallelized in the vocabulary dimension.
"""Embedding parallelized in the vocabulary dimension.
...
...
vllm/platforms/__init__.py
View file @
de5774fa
...
@@ -11,7 +11,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
...
@@ -11,7 +11,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from
vllm.utils.torch_utils
import
supports_xccl
from
vllm.utils.torch_utils
import
supports_xccl
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
from
.interface
import
CpuArchEnum
,
Platform
,
PlatformEnum
import
torch
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
vllm/platforms/rocm.py
View file @
de5774fa
...
@@ -367,9 +367,14 @@ class RocmPlatform(Platform):
...
@@ -367,9 +367,14 @@ class RocmPlatform(Platform):
@
with_amdsmi_context
@
with_amdsmi_context
@
lru_cache
(
maxsize
=
8
)
@
lru_cache
(
maxsize
=
8
)
def
get_device_name
(
cls
,
device_id
:
int
=
0
)
->
str
:
def
get_device_name
(
cls
,
device_id
:
int
=
0
)
->
str
:
# physical_device_id = cls.device_id_to_physical_device_id(device_id)
physical_device_id
=
device_id_to_physical_device_id
(
device_id
)
physical_device_id
=
device_id_to_physical_device_id
(
device_id
)
handle
=
amdsmi_get_processor_handles
()[
physical_device_id
]
handle
=
amdsmi_get_processor_handles
()[
physical_device_id
]
# return amdsmi_get_gpu_asic_info(handle)["market_name"]
# asic_info = amdsmi_get_gpu_asic_info(handle)
# device_name: str = asic_info["device_id"]
# if device_name in _ROCM_DEVICE_ID_NAME_MAP:
# return _ROCM_DEVICE_ID_NAME_MAP[device_name]
# return asic_info["market_name"]
return
torch
.
cuda
.
get_device_name
(
device_id
)
return
torch
.
cuda
.
get_device_name
(
device_id
)
@
classmethod
@
classmethod
...
...
vllm/transformers_utils/configs/__init__.py
View file @
de5774fa
...
@@ -14,7 +14,6 @@ from transformers import DeepseekV3Config
...
@@ -14,7 +14,6 @@ from transformers import DeepseekV3Config
from
vllm.transformers_utils.configs.afmoe
import
AfmoeConfig
from
vllm.transformers_utils.configs.afmoe
import
AfmoeConfig
from
vllm.transformers_utils.configs.chatglm
import
ChatGLMConfig
from
vllm.transformers_utils.configs.chatglm
import
ChatGLMConfig
from
vllm.transformers_utils.configs.deepseek_v3
import
DeepseekV3Config
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekVLV2Config
from
vllm.transformers_utils.configs.deepseek_vl2
import
DeepseekVLV2Config
from
vllm.transformers_utils.configs.dotsocr
import
DotsOCRConfig
from
vllm.transformers_utils.configs.dotsocr
import
DotsOCRConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
...
@@ -23,6 +22,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
...
@@ -23,6 +22,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
# `FalconConfig` class from the official HuggingFace transformers library.
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.falcon
import
RWConfig
from
vllm.transformers_utils.configs.fm9g
import
FM9GConfig
from
vllm.transformers_utils.configs.flex_olmo
import
FlexOlmoConfig
from
vllm.transformers_utils.configs.flex_olmo
import
FlexOlmoConfig
from
vllm.transformers_utils.configs.hunyuan_vl
import
(
from
vllm.transformers_utils.configs.hunyuan_vl
import
(
HunYuanVLConfig
,
HunYuanVLConfig
,
...
@@ -41,7 +41,6 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig
...
@@ -41,7 +41,6 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig
from
vllm.transformers_utils.configs.nemotron_h
import
NemotronHConfig
from
vllm.transformers_utils.configs.nemotron_h
import
NemotronHConfig
from
vllm.transformers_utils.configs.olmo3
import
Olmo3Config
from
vllm.transformers_utils.configs.olmo3
import
Olmo3Config
from
vllm.transformers_utils.configs.ovis
import
OvisConfig
from
vllm.transformers_utils.configs.ovis
import
OvisConfig
from
vllm.transformers_utils.configs.qwen3_next
import
Qwen3NextConfig
from
vllm.transformers_utils.configs.qwen3_next
import
Qwen3NextConfig
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.configs.radio
import
RadioConfig
from
vllm.transformers_utils.configs.speculators.base
import
SpeculatorsConfig
from
vllm.transformers_utils.configs.speculators.base
import
SpeculatorsConfig
...
...
vllm/transformers_utils/configs/deepseek_v3.py
deleted
100644 → 0
View file @
c1c5e4f6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
class
DeepseekV3Config
(
PretrainedConfig
):
model_type
=
"deepseek_v3"
keys_to_ignore_at_inference
=
[
"past_key_values"
]
def
__init__
(
self
,
vocab_size
=
129280
,
hidden_size
=
7168
,
intermediate_size
=
18432
,
moe_intermediate_size
=
2048
,
num_hidden_layers
=
61
,
num_nextn_predict_layers
=
1
,
num_attention_heads
=
128
,
num_key_value_heads
=
128
,
n_shared_experts
=
1
,
n_routed_experts
=
256
,
ep_size
=
1
,
routed_scaling_factor
=
2.5
,
kv_lora_rank
=
512
,
q_lora_rank
=
1536
,
qk_rope_head_dim
=
64
,
v_head_dim
=
128
,
qk_nope_head_dim
=
128
,
topk_method
=
'noaux_tc'
,
n_group
=
8
,
topk_group
=
4
,
num_experts_per_tok
=
8
,
moe_layer_freq
=
1
,
first_k_dense_replace
=
3
,
norm_topk_prob
=
True
,
scoring_func
=
'sigmoid'
,
hidden_act
=
"silu"
,
max_position_embeddings
=
4096
,
initializer_range
=
0.02
,
rms_norm_eps
=
1e-6
,
use_cache
=
True
,
pad_token_id
=
None
,
bos_token_id
=
0
,
eos_token_id
=
1
,
tie_word_embeddings
=
False
,
rope_theta
=
10000.0
,
rope_scaling
=
None
,
attention_bias
=
False
,
attention_dropout
=
0.0
,
**
kwargs
,
):
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
moe_intermediate_size
=
moe_intermediate_size
self
.
num_hidden_layers
=
num_hidden_layers
self
.
num_nextn_predict_layers
=
num_nextn_predict_layers
self
.
num_attention_heads
=
num_attention_heads
self
.
n_shared_experts
=
n_shared_experts
self
.
n_routed_experts
=
n_routed_experts
self
.
ep_size
=
ep_size
self
.
routed_scaling_factor
=
routed_scaling_factor
self
.
kv_lora_rank
=
kv_lora_rank
self
.
q_lora_rank
=
q_lora_rank
self
.
qk_rope_head_dim
=
qk_rope_head_dim
self
.
v_head_dim
=
v_head_dim
self
.
qk_nope_head_dim
=
qk_nope_head_dim
self
.
topk_method
=
topk_method
self
.
n_group
=
n_group
self
.
topk_group
=
topk_group
self
.
num_experts_per_tok
=
num_experts_per_tok
self
.
moe_layer_freq
=
moe_layer_freq
self
.
first_k_dense_replace
=
first_k_dense_replace
self
.
norm_topk_prob
=
norm_topk_prob
self
.
scoring_func
=
scoring_func
# for backward compatibility
if
num_key_value_heads
is
None
:
num_key_value_heads
=
num_attention_heads
self
.
num_key_value_heads
=
num_key_value_heads
self
.
hidden_act
=
hidden_act
self
.
initializer_range
=
initializer_range
self
.
rms_norm_eps
=
rms_norm_eps
self
.
use_cache
=
use_cache
self
.
rope_theta
=
rope_theta
self
.
rope_scaling
=
rope_scaling
self
.
attention_bias
=
attention_bias
self
.
attention_dropout
=
attention_dropout
super
().
__init__
(
pad_token_id
=
pad_token_id
,
bos_token_id
=
bos_token_id
,
eos_token_id
=
eos_token_id
,
tie_word_embeddings
=
tie_word_embeddings
,
**
kwargs
,
)
vllm/transformers_utils/configs/medusa.py
View file @
de5774fa
...
@@ -55,14 +55,6 @@ class MedusaConfig(PretrainedConfig):
...
@@ -55,14 +55,6 @@ class MedusaConfig(PretrainedConfig):
@
property
@
property
def
num_attention_heads
(
self
):
def
num_attention_heads
(
self
):
return
0
return
0
@
property
def
num_lookahead_heads
(
self
):
return
self
.
num_heads
@
num_lookahead_heads
.
setter
def
num_lookahead_heads
(
self
,
num_lookahead_heads
:
int
):
self
.
num_heads
=
num_lookahead_heads
@
property
@
property
def
num_lookahead_tokens
(
self
):
def
num_lookahead_tokens
(
self
):
...
...
vllm/transformers_utils/tokenizers/__init__.py
View file @
de5774fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
.mistral
import
(
MistralTokenizer
,
maybe_serialize_tool_calls
,
truncate_tool_call_ids
,
validate_request_params
)
from
vllm.transformers_utils.tokenizers.cpm_9g
import
CPM9GTokenizer
from
vllm.transformers_utils.tokenizers.cpm_9g
import
CPM9GTokenizer
__all__
=
[
__all__
=
[
"MistralTokenizer"
,
"maybe_serialize_tool_calls"
,
"truncate_tool_call_ids"
,
"validate_request_params"
,
"CPM9GTokenizer"
"CPM9GTokenizer"
]
]
vllm/utils/__init__.py
View file @
de5774fa
...
@@ -41,37 +41,6 @@ def __dir__() -> list[str]:
...
@@ -41,37 +41,6 @@ def __dir__() -> list[str]:
MASK_64_BITS
=
(
1
<<
64
)
-
1
MASK_64_BITS
=
(
1
<<
64
)
-
1
def
_maybe_force_spawn
():
"""Check if we need to force the use of the `spawn` multiprocessing start
method.
"""
if
os
.
environ
.
get
(
"VLLM_WORKER_MULTIPROC_METHOD"
)
==
"spawn"
:
return
reasons
=
[]
if
is_in_ray_actor
():
# even if we choose to spawn, we need to pass the ray address
# to the subprocess so that it knows how to connect to the ray cluster.
# env vars are inherited by subprocesses, even if we use spawn.
import
ray
os
.
environ
[
"RAY_ADDRESS"
]
=
ray
.
get_runtime_context
().
gcs_address
reasons
.
append
(
"In a Ray actor and can only be spawned"
)
if
cuda_is_initialized
():
reasons
.
append
(
"CUDA is initialized"
)
elif
xpu_is_initialized
():
reasons
.
append
(
"XPU is initialized"
)
if
reasons
:
logger
.
warning
(
"We must use the `spawn` multiprocessing start method. "
"Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
"See https://docs.vllm.ai/en/latest/usage/"
"troubleshooting.html#python-multiprocessing "
"for more information. Reasons: %s"
,
"; "
.
join
(
reasons
))
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
def
random_uuid
()
->
str
:
def
random_uuid
()
->
str
:
return
f
"
{
uuid
.
uuid4
().
int
&
MASK_64_BITS
:
016
x
}
"
# 16 hex chars
return
f
"
{
uuid
.
uuid4
().
int
&
MASK_64_BITS
:
016
x
}
"
# 16 hex chars
...
...
vllm/utils/deep_gemm.py
View file @
de5774fa
...
@@ -166,12 +166,6 @@ def get_mk_alignment_for_contiguous_layout() -> list[int]:
...
@@ -166,12 +166,6 @@ def get_mk_alignment_for_contiguous_layout() -> list[int]:
return
[
mk_align_size
,
mk_align_size
]
return
[
mk_align_size
,
mk_align_size
]
def
get_num_sms
()
->
int
:
_lazy_init
()
_dg
=
importlib
.
import_module
(
"deep_gemm"
)
return
int
(
_dg
.
get_num_sms
())
def
get_col_major_tma_aligned_tensor
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_col_major_tma_aligned_tensor
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensor"""
"""Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensor"""
_lazy_init
()
_lazy_init
()
...
@@ -315,100 +309,6 @@ def fp8_paged_mqa_logits(
...
@@ -315,100 +309,6 @@ def fp8_paged_mqa_logits(
)
)
def
fp8_mqa_logits
(
q
:
torch
.
Tensor
,
kv
:
tuple
[
torch
.
Tensor
,
torch
.
Tensor
],
weights
:
torch
.
Tensor
,
cu_seqlen_ks
:
torch
.
Tensor
,
cu_seqlen_ke
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
"""Compute FP8 MQA logits for a single sequence without KV paging.
Args:
q: Query tensor of shape [M, H, D]. Casted to
`torch.float8_e4m3fn` by caller.
kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
[N, 1]) with dtype `torch.float32`.
weights: weights of shape [M, H], dtype `torch.float32`.
cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
shape [M], dtype int32.
cu_seqlen_ke: End indices (exclusive) for valid K per query position,
shape [M], dtype int32.
Returns:
Logits tensor of shape [M, N], dtype `torch.float32`.
"""
_lazy_init
()
if
_fp8_mqa_logits_impl
is
None
:
return
_missing
()
return
_fp8_mqa_logits_impl
(
q
,
kv
,
weights
,
cu_seqlen_ks
,
cu_seqlen_ke
)
def
get_paged_mqa_logits_metadata
(
context_lens
:
torch
.
Tensor
,
block_size
:
int
,
num_sms
:
int
)
->
torch
.
Tensor
:
"""Build scheduling metadata for paged MQA logits.
Args:
context_lens: Tensor of shape [B], dtype int32; effective context length
per batch element.
block_size: KV-cache block size in tokens (e.g., 64).
num_sms: Number of SMs available. 132 for Hopper
Returns:
Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
schedule work across SMs.
"""
_lazy_init
()
if
_get_paged_mqa_logits_metadata_impl
is
None
:
return
_missing
()
return
_get_paged_mqa_logits_metadata_impl
(
context_lens
,
block_size
,
num_sms
)
def
fp8_paged_mqa_logits
(
q_fp8
:
torch
.
Tensor
,
kv_cache_fp8
:
torch
.
Tensor
,
weights
:
torch
.
Tensor
,
context_lens
:
torch
.
Tensor
,
block_tables
:
torch
.
Tensor
,
schedule_metadata
:
torch
.
Tensor
,
max_model_len
:
int
,
)
->
torch
.
Tensor
:
"""Compute FP8 MQA logits using paged KV-cache.
Args:
q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
`torch.float8_e4m3fn` by caller.
kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
[num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
4 bytes per (block,pos) store the `float` dequant scale.
weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
context_lens: Tensor of shape [B], dtype int32; effective context length
for each batch element.
block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
block indices to physical blocks in the paged cache.
schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
used to distribute work across SMs.
max_model_len: Maximum sequence length used to size the logits output.
Returns:
Logits tensor of shape [B * next_n, max_model_len], dtype
`torch.float32`.
"""
_lazy_init
()
if
_fp8_paged_mqa_logits_impl
is
None
:
return
_missing
()
return
_fp8_paged_mqa_logits_impl
(
q_fp8
,
kv_cache_fp8
,
weights
,
context_lens
,
block_tables
,
schedule_metadata
,
max_model_len
,
clean_logits
=
True
)
def
_ceil_to_ue8m0
(
x
:
torch
.
Tensor
):
def
_ceil_to_ue8m0
(
x
:
torch
.
Tensor
):
return
torch
.
pow
(
2.0
,
torch
.
ceil
(
torch
.
log2
(
x
.
abs
())))
return
torch
.
pow
(
2.0
,
torch
.
ceil
(
torch
.
log2
(
x
.
abs
())))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment