Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8900b622
Commit
8900b622
authored
Jan 28, 2026
by
chenyue3
Browse files
修复vit attn的导入问题,以及w4a16的gptq的接口问题
parent
eafda883
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
4 additions
and
4 deletions
+4
-4
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq.py
+1
-1
vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
...or/layers/quantization/kernels/mixed_precision/exllama.py
+2
-2
vllm/v1/attention/backends/fa_utils.py
vllm/v1/attention/backends/fa_utils.py
+1
-1
No files found.
vllm/model_executor/layers/quantization/gptq.py
View file @
8900b622
...
@@ -385,7 +385,7 @@ class GPTQLinearMethod(LinearMethodBase):
...
@@ -385,7 +385,7 @@ class GPTQLinearMethod(LinearMethodBase):
layer
.
scales
,
layer
.
scales
,
layer
.
g_idx
,
layer
.
g_idx
,
layer
.
exllama_state
==
ExllamaState
.
READY
,
layer
.
exllama_state
==
ExllamaState
.
READY
,
self
.
use_v2_format
,
#
self.use_v2_format,
self
.
quant_config
.
weight_bits
,
self
.
quant_config
.
weight_bits
,
)
)
if
bias
is
not
None
:
if
bias
is
not
None
:
...
...
vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
View file @
8900b622
...
@@ -155,12 +155,12 @@ class ExllamaLinearKernel(MPLinearKernel):
...
@@ -155,12 +155,12 @@ class ExllamaLinearKernel(MPLinearKernel):
# gptq_gemm supports GPTQv2 format by passing use_v2_format=True.
# gptq_gemm supports GPTQv2 format by passing use_v2_format=True.
# However, the MPLinearLayerConfig doesn't contain format info.
# However, the MPLinearLayerConfig doesn't contain format info.
# So hardcode GPTQv1 format here, to keep its behavior unchanged.
# So hardcode GPTQv1 format here, to keep its behavior unchanged.
use_v2_format
=
False
#
use_v2_format = False
assert
w_zp
is
not
None
,
"Zero points are required by Exllama"
assert
w_zp
is
not
None
,
"Zero points are required by Exllama"
assert
w_g_idx
is
not
None
,
"Group index is required by Exllama"
assert
w_g_idx
is
not
None
,
"Group index is required by Exllama"
output
=
ops
.
gptq_gemm
(
output
=
ops
.
gptq_gemm
(
x_2d
,
w_q
,
w_zp
,
w_s
,
w_g_idx
,
True
,
use_v2_format
,
c
.
weight_type
.
size_bits
x_2d
,
w_q
,
w_zp
,
w_s
,
w_g_idx
,
True
,
c
.
weight_type
.
size_bits
)
)
if
bias
is
not
None
:
if
bias
is
not
None
:
...
...
vllm/v1/attention/backends/fa_utils.py
View file @
8900b622
...
@@ -23,7 +23,7 @@ elif current_platform.is_xpu():
...
@@ -23,7 +23,7 @@ elif current_platform.is_xpu():
elif
current_platform
.
is_rocm
():
elif
current_platform
.
is_rocm
():
try
:
try
:
from
vllm._custom_ops
import
reshape_and_cache_cuda
from
vllm._custom_ops
import
reshape_and_cache_cuda
from
flash_attn
import
vllm_flash_attn_varlen_func
from
flash_attn
import
flash_attn_varlen_func
,
vllm_flash_attn_varlen_func
except
ImportError
as
e
:
except
ImportError
as
e
:
raise
ImportError
(
raise
ImportError
(
"Rocm platform requires upstream flash-attn "
"Rocm platform requires upstream flash-attn "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment