Commit 641fc5b7 authored by zhuwenwen
Browse files

remove unused code

parent b01efa0b
...@@ -13,7 +13,6 @@ from vllm.attention.backends.utils import (CommonAttentionState, ...@@ -13,7 +13,6 @@ from vllm.attention.backends.utils import (CommonAttentionState,
from vllm.attention.ops.blocksparse_attention.interface import ( from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn, get_head_sliding_step) LocalStridedBlockSparseAttn, get_head_sliding_step)
from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttention
from vllm import _custom_ops as ops
from vllm.distributed import (get_tensor_model_parallel_rank, from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size) get_tensor_model_parallel_world_size)
......
...@@ -148,7 +148,6 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): ...@@ -148,7 +148,6 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
cross_slot_mapping: Optional[torch.Tensor] = None cross_slot_mapping: Optional[torch.Tensor] = None
cross_block_tables: Optional[torch.Tensor] = None cross_block_tables: Optional[torch.Tensor] = None
@property @property
def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]:
if self.num_prefills == 0: if self.num_prefills == 0:
...@@ -723,7 +722,6 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -723,7 +722,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
attn_masks[0][None] attn_masks[0][None]
if attn_masks is not None else None, if attn_masks is not None else None,
) )
elif self.use_naive_attn: elif self.use_naive_attn:
if self.num_kv_heads != self.num_heads: if self.num_kv_heads != self.num_heads:
# Interleave for MQA workaround. # Interleave for MQA workaround.
......
...@@ -2305,7 +2305,6 @@ class SpeculativeConfig: ...@@ -2305,7 +2305,6 @@ class SpeculativeConfig:
f"other value than 1 or target model tensor_parallel_size") f"other value than 1 or target model tensor_parallel_size")
return speculative_draft_tensor_parallel_size return speculative_draft_tensor_parallel_size
@staticmethod @staticmethod
def create_draft_parallel_config( def create_draft_parallel_config(
target_parallel_config: ParallelConfig, target_parallel_config: ParallelConfig,
......
...@@ -19,7 +19,6 @@ from vllm.utils import cuda_device_count_stateless ...@@ -19,7 +19,6 @@ from vllm.utils import cuda_device_count_stateless
try: try:
ops.meta_size() ops.meta_size()
custom_ar = True custom_ar = True
except Exception: except Exception:
# For CPUs # For CPUs
custom_ar = False custom_ar = False
...@@ -130,7 +129,6 @@ class CustomAllreduce: ...@@ -130,7 +129,6 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases # test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported # where custom allreduce is not supported
# this checks hardware and driver support for NVLink # this checks hardware and driver support for NVLink
assert current_platform.is_cuda_alike() assert current_platform.is_cuda_alike()
fully_connected = current_platform.is_fully_connected( fully_connected = current_platform.is_fully_connected(
physical_device_ids) physical_device_ids)
......
...@@ -341,6 +341,7 @@ class LoRAModelManager(AdapterModelManager): ...@@ -341,6 +341,7 @@ class LoRAModelManager(AdapterModelManager):
# Used for long context lora. # Used for long context lora.
self.scaling_factor_to_offset: Dict[float, int] = {} self.scaling_factor_to_offset: Dict[float, int] = {}
super().__init__(model) super().__init__(model)
self.supported_lora_modules = get_supported_lora_modules(self.model) self.supported_lora_modules = get_supported_lora_modules(self.model)
assert self.supported_lora_modules, "No supported LoRA modules found in" assert self.supported_lora_modules, "No supported LoRA modules found in"
f"{self.model.__class__.__name__}." f"{self.model.__class__.__name__}."
......
...@@ -9,7 +9,6 @@ import numpy as np ...@@ -9,7 +9,6 @@ import numpy as np
import numpy.typing as npt import numpy.typing as npt
import torch import torch
from PIL import Image from PIL import Image
import os
import vllm.envs as envs import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection from vllm.connections import HTTPConnection, global_http_connection
...@@ -87,7 +86,6 @@ class MediaConnector: ...@@ -87,7 +86,6 @@ class MediaConnector:
return media_io.load_file(filepath) return media_io.load_file(filepath)
def load_from_url( def load_from_url(
self, self,
url: str, url: str,
......
...@@ -32,10 +32,6 @@ class CpuPlatform(Platform): ...@@ -32,10 +32,6 @@ class CpuPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str: def get_device_name(cls, device_id: int = 0) -> str:
return "cpu" return "cpu"
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
return psutil.virtual_memory().total
@classmethod @classmethod
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
dtype: torch.dtype, kv_cache_dtype: Optional[str], dtype: torch.dtype, kv_cache_dtype: Optional[str],
......
...@@ -203,11 +203,6 @@ class Platform: ...@@ -203,11 +203,6 @@ class Platform:
""" """
raise NotImplementedError raise NotImplementedError
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
"""Get the total memory of a device in bytes."""
raise NotImplementedError
@classmethod @classmethod
def inference_mode(cls): def inference_mode(cls):
"""A device-specific wrapper of `torch.inference_mode`. """A device-specific wrapper of `torch.inference_mode`.
......
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""
import os
from typing import Union
from transformers import PretrainedConfig
class Qwen2VLVisionConfig(PretrainedConfig):
    """Configuration for the vision sub-model of Qwen2-VL.

    Each constructor argument is stored unchanged as an attribute of the
    same name. Any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        **kwargs,
    ):
        # Base-class initialisation runs first, exactly as before, so the
        # explicit fields below are assigned after the generic kwargs.
        super().__init__(**kwargs)

        # Encoder layout.
        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads

        # Input patching parameters.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "PretrainedConfig":
        """Build a vision config from a pretrained checkpoint location.

        If the loaded config dict declares ``model_type == "qwen2_vl"``,
        only its ``"vision_config"`` entry is used; otherwise the loaded
        dict is used as-is.
        """
        cls._set_token_in_kwargs(kwargs)
        loaded, remaining = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # A full Qwen2-VL config nests the vision settings one level down.
        is_full_model_config = loaded.get("model_type") == "qwen2_vl"
        vision_dict = (loaded["vision_config"]
                       if is_full_model_config else loaded)

        return cls.from_dict(vision_dict, **remaining)
class Qwen2VLConfig(PretrainedConfig):
    """Top-level configuration for Qwen2-VL.

    Stores the language-model hyperparameters as attributes of the same
    name and holds the vision settings in ``self.vision_config``.

    ``vision_config`` may be:
      * a ``dict`` -- expanded into ``Qwen2VLVisionConfig(**vision_config)``;
      * ``None`` -- replaced by a default ``Qwen2VLVisionConfig()``;
      * any other object (e.g. an already-built ``Qwen2VLVisionConfig``) --
        stored as given.

    ``tie_word_embeddings`` and any extra keyword arguments are forwarded
    to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()
        else:
            # Previously a non-dict, non-None value fell through both
            # branches and ``self.vision_config`` was never set, so the
            # argument was silently dropped. Keep the object the caller
            # supplied instead.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # NOTE: the following section from original transformers config
        # for Qwen2-VL is commented out to address rope config loading issue
        #
        # if self.rope_scaling is not None and "type" in self.rope_scaling:
        #     if self.rope_scaling["type"] == "mrope":
        #         self.rope_scaling["type"] = "default"
        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        # rope_config_validation(self)

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
\ No newline at end of file
...@@ -352,6 +352,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): ...@@ -352,6 +352,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_adapter_prompt_mapping prompt_adapter_prompt_mapping
else: else:
self.prompt_adapter_prompt_mapping.clear() self.prompt_adapter_prompt_mapping.clear()
else: else:
self.input_tokens = input_tokens or [] self.input_tokens = input_tokens or []
self.input_positions = input_positions or [] self.input_positions = input_positions or []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment