Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d29483b5
Unverified
Commit d29483b5 authored Oct 17, 2025 by Zhuohan Li
Committed by GitHub on Oct 17, 2025
Browse files
[Minor] Remove unnecessary error message (#27115)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
parent
950cf9e5
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 19 additions and 55 deletions
+19
-55
vllm/attention/layer.py
vllm/attention/layer.py
+8
-27
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+11
-28
No files found.
vllm/attention/layer.py
View file @
d29483b5
...
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
...
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.model_executor.models.vision
import
get_vit_attn_backend
from
vllm.model_executor.models.vision
import
get_vit_attn_backend
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
GiB_bytes
,
direct_register_custom_op
from
vllm.utils
import
direct_register_custom_op
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
...
@@ -281,25 +281,10 @@ class Attention(nn.Module, AttentionLayerBase):
)
)
]
]
try
:
# Initialize q/k/v range constants.
self
.
q_range
=
torch
.
tensor
(
envs
.
Q_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
q_range
=
torch
.
tensor
(
envs
.
Q_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
k_range
=
torch
.
tensor
(
envs
.
K_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
k_range
=
torch
.
tensor
(
envs
.
K_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
v_range
=
torch
.
tensor
(
envs
.
V_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
v_range
=
torch
.
tensor
(
envs
.
V_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
except
torch
.
cuda
.
OutOfMemoryError
as
e
:
logger
.
error
(
"Failed to initialize attention q/k/v range constants: %s"
,
e
)
if
torch
.
cuda
.
is_available
():
logger
.
debug
(
"CUDA device: %s"
,
torch
.
cuda
.
current_device
())
logger
.
debug
(
"Allocated: %.2f GiB"
,
torch
.
cuda
.
memory_allocated
()
/
GiB_bytes
)
logger
.
debug
(
"Reserved: %.2f GiB"
,
torch
.
cuda
.
memory_reserved
()
/
GiB_bytes
)
raise
RuntimeError
(
"Failed to initialize q/k/v range constants. "
"This may be caused by insufficient memory to allocate "
"kv cache."
)
from
e
# for attn backends supporting query quantization
# for attn backends supporting query quantization
self
.
query_quant
=
None
self
.
query_quant
=
None
...
@@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
...
@@ -668,13 +653,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
self
.
use_sparse
=
use_sparse
self
.
use_sparse
=
use_sparse
# Initialize q/k/v range constants.
# Initialize q/k/v range constants.
try
:
self
.
q_range
=
torch
.
tensor
(
envs
.
Q_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
q_range
=
torch
.
tensor
(
envs
.
Q_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
k_range
=
torch
.
tensor
(
envs
.
K_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
k_range
=
torch
.
tensor
(
envs
.
K_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
v_range
=
torch
.
tensor
(
envs
.
V_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
self
.
v_range
=
torch
.
tensor
(
envs
.
V_SCALE_CONSTANT
,
dtype
=
torch
.
float32
)
except
torch
.
cuda
.
OutOfMemoryError
:
# Keep defaults if allocation fails; not critical for init.
pass
def
forward
(
def
forward
(
self
,
self
,
...
...
vllm/model_executor/layers/linear.py
View file @
d29483b5
...
@@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
...
@@ -34,7 +34,6 @@ from vllm.model_executor.parameter import (
)
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
GiB_bytes
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -211,33 +210,17 @@ class UnquantizedLinearMethod(LinearMethodBase):
...
@@ -211,33 +210,17 @@ class UnquantizedLinearMethod(LinearMethodBase):
# The weights are not quantized, and they are not sharded.
# The weights are not quantized, and they are not sharded.
# The amount of memory allocated for the weights is
# The amount of memory allocated for the weights is
# sum(output_partition_sizes) * input_size_per_partition.
# sum(output_partition_sizes) * input_size_per_partition.
try
:
weight_loader
=
extra_weight_attrs
.
pop
(
"weight_loader"
)
weight_loader
=
extra_weight_attrs
.
pop
(
"weight_loader"
)
weight
=
ModelWeightParameter
(
weight
=
ModelWeightParameter
(
data
=
torch
.
empty
(
data
=
torch
.
empty
(
sum
(
output_partition_sizes
),
sum
(
output_partition_sizes
),
input_size_per_partition
,
input_size_per_partition
,
dtype
=
params_dtype
,
dtype
=
params_dtype
,
),
),
input_dim
=
1
,
input_dim
=
1
,
output_dim
=
0
,
output_dim
=
0
,
weight_loader
=
weight_loader
,
weight_loader
=
weight_loader
,
)
)
except
torch
.
cuda
.
OutOfMemoryError
as
e
:
logger
.
error
(
"Failed to create unquantized linear weights: %s"
,
e
)
if
torch
.
cuda
.
is_available
():
logger
.
debug
(
"CUDA device: %s"
,
torch
.
cuda
.
current_device
())
logger
.
debug
(
"Allocated: %.2f GiB"
,
torch
.
cuda
.
memory_allocated
()
/
GiB_bytes
)
logger
.
debug
(
"Reserved: %.2f GiB"
,
torch
.
cuda
.
memory_reserved
()
/
GiB_bytes
)
raise
RuntimeError
(
"Failed to create unquantized linear weights. "
"This may be caused by insufficient memory to allocate "
"the weight."
)
from
e
layer
.
register_parameter
(
"weight"
,
weight
)
layer
.
register_parameter
(
"weight"
,
weight
)
set_weight_attrs
(
weight
,
extra_weight_attrs
)
set_weight_attrs
(
weight
,
extra_weight_attrs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.