Commit de5774fa authored by zhuwenwen's avatar zhuwenwen
Browse files

remove unused code

parent c1c5e4f6
......@@ -163,10 +163,6 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
if llama_4_scaling is not None:
q *= llama_4_scaling
if self.indexer and self.is_sparse:
_topk_indices = self.indexer(hidden_states, q_c, positions,
self.rotary_emb)
attn_out = self.mla_attn(
q,
kv_c_normed,
......
......@@ -20,6 +20,7 @@ if current_platform.is_rocm():
logger = init_logger(__name__)
# common functions
def rotate_neox(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., : x.shape[-1] // 2]
......
......@@ -27,10 +27,8 @@ from vllm.model_executor.layers.quantization.base_config import (
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
from vllm.model_executor.parameter import BasevLLMParameter
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
DEFAULT_VOCAB_PADDING_SIZE = 64
......@@ -211,6 +209,7 @@ def get_masked_input_and_mask(
input_ = vocab_mask * (input_ - valid_offset)
return input_, ~vocab_mask
@CustomOp.register("vocab_parallel_embedding")
class VocabParallelEmbedding(CustomOp):
"""Embedding parallelized in the vocabulary dimension.
......
......@@ -11,7 +11,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from vllm.utils.torch_utils import supports_xccl
from .interface import CpuArchEnum, Platform, PlatformEnum
import torch
logger = logging.getLogger(__name__)
......
......@@ -367,9 +367,14 @@ class RocmPlatform(Platform):
@with_amdsmi_context
@lru_cache(maxsize=8)
def get_device_name(cls, device_id: int = 0) -> str:
# physical_device_id = cls.device_id_to_physical_device_id(device_id)
physical_device_id = device_id_to_physical_device_id(device_id)
handle = amdsmi_get_processor_handles()[physical_device_id]
# return amdsmi_get_gpu_asic_info(handle)["market_name"]
# asic_info = amdsmi_get_gpu_asic_info(handle)
# device_name: str = asic_info["device_id"]
# if device_name in _ROCM_DEVICE_ID_NAME_MAP:
# return _ROCM_DEVICE_ID_NAME_MAP[device_name]
# return asic_info["market_name"]
return torch.cuda.get_device_name(device_id)
@classmethod
......
......@@ -14,7 +14,6 @@ from transformers import DeepseekV3Config
from vllm.transformers_utils.configs.afmoe import AfmoeConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
......@@ -23,6 +22,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.fm9g import FM9GConfig
from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
from vllm.transformers_utils.configs.hunyuan_vl import (
HunYuanVLConfig,
......@@ -41,7 +41,6 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from vllm.transformers_utils.configs.olmo3 import Olmo3Config
from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class DeepseekV3Config(PretrainedConfig):
model_type = "deepseek_v3"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=129280,
hidden_size=7168,
intermediate_size=18432,
moe_intermediate_size=2048,
num_hidden_layers=61,
num_nextn_predict_layers=1,
num_attention_heads=128,
num_key_value_heads=128,
n_shared_experts=1,
n_routed_experts=256,
ep_size=1,
routed_scaling_factor=2.5,
kv_lora_rank=512,
q_lora_rank=1536,
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='noaux_tc',
n_group=8,
topk_group=4,
num_experts_per_tok=8,
moe_layer_freq=1,
first_k_dense_replace=3,
norm_topk_prob=True,
scoring_func='sigmoid',
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=0,
eos_token_id=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_nextn_predict_layers = num_nextn_predict_layers
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
self.ep_size = ep_size
self.routed_scaling_factor = routed_scaling_factor
self.kv_lora_rank = kv_lora_rank
self.q_lora_rank = q_lora_rank
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.qk_nope_head_dim = qk_nope_head_dim
self.topk_method = topk_method
self.n_group = n_group
self.topk_group = topk_group
self.num_experts_per_tok = num_experts_per_tok
self.moe_layer_freq = moe_layer_freq
self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
......@@ -56,14 +56,6 @@ class MedusaConfig(PretrainedConfig):
def num_attention_heads(self):
return 0
@property
def num_lookahead_heads(self):
return self.num_heads
@num_lookahead_heads.setter
def num_lookahead_heads(self, num_lookahead_heads: int):
self.num_heads = num_lookahead_heads
@property
def num_lookahead_tokens(self):
return self.num_heads
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
truncate_tool_call_ids, validate_request_params)
from vllm.transformers_utils.tokenizers.cpm_9g import CPM9GTokenizer
__all__ = [
"MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
"validate_request_params",
"CPM9GTokenizer"
]
......@@ -41,37 +41,6 @@ def __dir__() -> list[str]:
MASK_64_BITS = (1 << 64) - 1
def _maybe_force_spawn():
"""Check if we need to force the use of the `spawn` multiprocessing start
method.
"""
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
return
reasons = []
if is_in_ray_actor():
# even if we choose to spawn, we need to pass the ray address
# to the subprocess so that it knows how to connect to the ray cluster.
# env vars are inherited by subprocesses, even if we use spawn.
import ray
os.environ["RAY_ADDRESS"] = ray.get_runtime_context().gcs_address
reasons.append("In a Ray actor and can only be spawned")
if cuda_is_initialized():
reasons.append("CUDA is initialized")
elif xpu_is_initialized():
reasons.append("XPU is initialized")
if reasons:
logger.warning(
"We must use the `spawn` multiprocessing start method. "
"Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
"See https://docs.vllm.ai/en/latest/usage/"
"troubleshooting.html#python-multiprocessing "
"for more information. Reasons: %s", "; ".join(reasons))
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def random_uuid() -> str:
return f"{uuid.uuid4().int & MASK_64_BITS:016x}" # 16 hex chars
......
......@@ -166,12 +166,6 @@ def get_mk_alignment_for_contiguous_layout() -> list[int]:
return [mk_align_size, mk_align_size]
def get_num_sms() -> int:
_lazy_init()
_dg = importlib.import_module("deep_gemm")
return int(_dg.get_num_sms())
def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
"""Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensor"""
_lazy_init()
......@@ -315,100 +309,6 @@ def fp8_paged_mqa_logits(
)
def fp8_mqa_logits(
q: torch.Tensor,
kv: tuple[torch.Tensor, torch.Tensor],
weights: torch.Tensor,
cu_seqlen_ks: torch.Tensor,
cu_seqlen_ke: torch.Tensor,
) -> torch.Tensor:
"""Compute FP8 MQA logits for a single sequence without KV paging.
Args:
q: Query tensor of shape [M, H, D]. Casted to
`torch.float8_e4m3fn` by caller.
kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
[N, 1]) with dtype `torch.float32`.
weights: weights of shape [M, H], dtype `torch.float32`.
cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
shape [M], dtype int32.
cu_seqlen_ke: End indices (exclusive) for valid K per query position,
shape [M], dtype int32.
Returns:
Logits tensor of shape [M, N], dtype `torch.float32`.
"""
_lazy_init()
if _fp8_mqa_logits_impl is None:
return _missing()
return _fp8_mqa_logits_impl(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)
def get_paged_mqa_logits_metadata(context_lens: torch.Tensor, block_size: int,
num_sms: int) -> torch.Tensor:
"""Build scheduling metadata for paged MQA logits.
Args:
context_lens: Tensor of shape [B], dtype int32; effective context length
per batch element.
block_size: KV-cache block size in tokens (e.g., 64).
num_sms: Number of SMs available. 132 for Hopper
Returns:
Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
schedule work across SMs.
"""
_lazy_init()
if _get_paged_mqa_logits_metadata_impl is None:
return _missing()
return _get_paged_mqa_logits_metadata_impl(context_lens, block_size,
num_sms)
def fp8_paged_mqa_logits(
q_fp8: torch.Tensor,
kv_cache_fp8: torch.Tensor,
weights: torch.Tensor,
context_lens: torch.Tensor,
block_tables: torch.Tensor,
schedule_metadata: torch.Tensor,
max_model_len: int,
) -> torch.Tensor:
"""Compute FP8 MQA logits using paged KV-cache.
Args:
q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
`torch.float8_e4m3fn` by caller.
kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
[num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
4 bytes per (block,pos) store the `float` dequant scale.
weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
context_lens: Tensor of shape [B], dtype int32; effective context length
for each batch element.
block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
block indices to physical blocks in the paged cache.
schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
used to distribute work across SMs.
max_model_len: Maximum sequence length used to size the logits output.
Returns:
Logits tensor of shape [B * next_n, max_model_len], dtype
`torch.float32`.
"""
_lazy_init()
if _fp8_paged_mqa_logits_impl is None:
return _missing()
return _fp8_paged_mqa_logits_impl(q_fp8,
kv_cache_fp8,
weights,
context_lens,
block_tables,
schedule_metadata,
max_model_len,
clean_logits=True)
def _ceil_to_ue8m0(x: torch.Tensor):
return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment