Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
641fc5b7
Commit
641fc5b7
authored
Apr 14, 2025
by
zhuwenwen
Browse files
remove unused code
parent
b01efa0b
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
6 additions
and
152 deletions
+6
-152
vllm/attention/backends/blocksparse_attn.py
vllm/attention/backends/blocksparse_attn.py
+0
-1
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/rocm_flash_attn.py
+4
-6
vllm/config.py
vllm/config.py
+0
-1
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+0
-2
vllm/lora/models.py
vllm/lora/models.py
+1
-0
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+0
-2
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+0
-4
vllm/platforms/interface.py
vllm/platforms/interface.py
+0
-5
vllm/transformers_utils/configs/qwen2vl.py
vllm/transformers_utils/configs/qwen2vl.py
+0
-131
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+1
-0
No files found.
vllm/attention/backends/blocksparse_attn.py
View file @
641fc5b7
...
...
@@ -13,7 +13,6 @@ from vllm.attention.backends.utils import (CommonAttentionState,
from
vllm.attention.ops.blocksparse_attention.interface
import
(
LocalStridedBlockSparseAttn
,
get_head_sliding_step
)
from
vllm.attention.ops.paged_attn
import
PagedAttention
from
vllm
import
_custom_ops
as
ops
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
...
...
vllm/attention/backends/rocm_flash_attn.py
View file @
641fc5b7
...
...
@@ -148,7 +148,6 @@ class ROCmFlashAttentionMetadata(AttentionMetadata, PagedAttentionMetadata):
cross_slot_mapping
:
Optional
[
torch
.
Tensor
]
=
None
cross_block_tables
:
Optional
[
torch
.
Tensor
]
=
None
@
property
def
prefill_metadata
(
self
)
->
Optional
[
"ROCmFlashAttentionMetadata"
]:
if
self
.
num_prefills
==
0
:
...
...
@@ -524,10 +523,10 @@ class ROCmFlashAttentionImpl(AttentionImpl):
logger
.
debug
(
"Using Triton FA in ROCmBackend"
)
if
self
.
sliding_window
!=
(
-
1
,
-
1
):
logger
.
warning
(
"ROCm Triton FA does not currently support "
"sliding window attention. If using half "
"precision, please try using the ROCm CK "
"FA backend instead by setting the env var "
"`VLLM_USE_TRITON_FLASH_ATTN=0`"
)
"sliding window attention. If using half "
"precision, please try using the ROCm CK "
"FA backend instead by setting the env var "
"`VLLM_USE_TRITON_FLASH_ATTN=0`"
)
else
:
# if not using triton, navi3x/navi21/navi10 do not use flash-attn
# either
...
...
@@ -723,7 +722,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
attn_masks
[
0
][
None
]
if
attn_masks
is
not
None
else
None
,
)
elif
self
.
use_naive_attn
:
if
self
.
num_kv_heads
!=
self
.
num_heads
:
# Interleave for MQA workaround.
...
...
vllm/config.py
View file @
641fc5b7
...
...
@@ -2305,7 +2305,6 @@ class SpeculativeConfig:
f
"other value than 1 or target model tensor_parallel_size"
)
return
speculative_draft_tensor_parallel_size
@
staticmethod
def
create_draft_parallel_config
(
target_parallel_config
:
ParallelConfig
,
...
...
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
641fc5b7
...
...
@@ -19,7 +19,6 @@ from vllm.utils import cuda_device_count_stateless
try
:
ops
.
meta_size
()
custom_ar
=
True
except
Exception
:
# For CPUs
custom_ar
=
False
...
...
@@ -130,7 +129,6 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
assert
current_platform
.
is_cuda_alike
()
fully_connected
=
current_platform
.
is_fully_connected
(
physical_device_ids
)
...
...
vllm/lora/models.py
View file @
641fc5b7
...
...
@@ -341,6 +341,7 @@ class LoRAModelManager(AdapterModelManager):
# Used for long context lora.
self
.
scaling_factor_to_offset
:
Dict
[
float
,
int
]
=
{}
super
().
__init__
(
model
)
self
.
supported_lora_modules
=
get_supported_lora_modules
(
self
.
model
)
assert
self
.
supported_lora_modules
,
"No supported LoRA modules found in"
f
"
{
self
.
model
.
__class__
.
__name__
}
."
...
...
vllm/multimodal/utils.py
View file @
641fc5b7
...
...
@@ -9,7 +9,6 @@ import numpy as np
import
numpy.typing
as
npt
import
torch
from
PIL
import
Image
import
os
import
vllm.envs
as
envs
from
vllm.connections
import
HTTPConnection
,
global_http_connection
...
...
@@ -87,7 +86,6 @@ class MediaConnector:
return
media_io
.
load_file
(
filepath
)
def
load_from_url
(
self
,
url
:
str
,
...
...
vllm/platforms/cpu.py
View file @
641fc5b7
...
...
@@ -31,10 +31,6 @@ class CpuPlatform(Platform):
@
classmethod
def
get_device_name
(
cls
,
device_id
:
int
=
0
)
->
str
:
return
"cpu"
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
return
psutil
.
virtual_memory
().
total
@
classmethod
def
get_attn_backend_cls
(
cls
,
selected_backend
:
_Backend
,
head_size
:
int
,
...
...
vllm/platforms/interface.py
View file @
641fc5b7
...
...
@@ -203,11 +203,6 @@ class Platform:
"""
raise
NotImplementedError
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
"""Get the total memory of a device in bytes."""
raise
NotImplementedError
@
classmethod
def
inference_mode
(
cls
):
"""A device-specific wrapper of `torch.inference_mode`.
...
...
vllm/transformers_utils/configs/qwen2vl.py
deleted
100644 → 0
View file @
b01efa0b
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""
import
os
from
typing
import
Union
from
transformers
import
PretrainedConfig
class Qwen2VLVisionConfig(PretrainedConfig):
    """Configuration holder for the vision part of a Qwen2-VL model.

    Each constructor argument is stored unchanged as an instance attribute
    of the same name. Any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        **kwargs,
    ):
        # The base class is initialised first, exactly as before, so the
        # attributes below are assigned on an already set-up config object.
        super().__init__(**kwargs)

        # Encoder dimensions (presumably layer count / widths -- the names
        # are the only evidence here; confirm against the model code).
        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads

        # Input / patching parameters.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "PretrainedConfig":
        """Build a vision config from a pretrained model name or path.

        The config dictionary is fetched with ``cls.get_config_dict``. When
        its ``"model_type"`` is ``"qwen2_vl"`` the dictionary is taken to be
        a full model config and only its ``"vision_config"`` entry is used;
        otherwise the dictionary is used as-is.

        Args:
            pretrained_model_name_or_path: Model identifier or filesystem
                path handed to ``get_config_dict``.
            **kwargs: Passed to ``get_config_dict``; whatever it returns as
                remaining kwargs is passed on to ``from_dict``.

        Returns:
            The object produced by ``cls.from_dict``.
        """
        cls._set_token_in_kwargs(kwargs)

        loaded_dict, remaining_kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # A full Qwen2-VL config nests the vision settings under
        # "vision_config"; unwrap it in that case.
        is_full_model_config = loaded_dict.get("model_type") == "qwen2_vl"
        vision_dict = (loaded_dict["vision_config"]
                       if is_full_model_config else loaded_dict)

        return cls.from_dict(vision_dict, **remaining_kwargs)
class Qwen2VLConfig(PretrainedConfig):
    """Top-level configuration for a Qwen2-VL model.

    Stores the language-model hyperparameters as attributes of the same
    name and keeps the vision settings in ``self.vision_config``.
    """

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        """Initialise the configuration.

        Args:
            vision_config: ``None`` (a default ``Qwen2VLVisionConfig`` is
                created), a ``dict`` of keyword arguments for
                ``Qwen2VLVisionConfig``, or an existing
                ``Qwen2VLVisionConfig`` instance, which is stored as given.
            num_key_value_heads: When ``None``, falls back to
                ``num_attention_heads``.
            tie_word_embeddings: Forwarded to ``PretrainedConfig.__init__``.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.

        All remaining arguments are stored unchanged as attributes of the
        same name.
        """
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()
        elif isinstance(vision_config, Qwen2VLVisionConfig):
            # Previously an already-built vision config matched neither
            # branch and was silently dropped, leaving the attribute unset.
            self.vision_config = vision_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # NOTE: the following section from original transformers config
        # for Qwen2-VL is commented out to address rope config loading issue
        #
        # if self.rope_scaling is not None and "type" in self.rope_scaling:
        #     if self.rope_scaling["type"] == "mrope":
        #         self.rope_scaling["type"] = "default"
        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        #     rope_config_validation(self)

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
\ No newline at end of file
vllm/worker/model_runner.py
View file @
641fc5b7
...
...
@@ -352,6 +352,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_adapter_prompt_mapping
else
:
self
.
prompt_adapter_prompt_mapping
.
clear
()
else
:
self
.
input_tokens
=
input_tokens
or
[]
self
.
input_positions
=
input_positions
or
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment