Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
65b1f121
Unverified
Commit
65b1f121
authored
Jul 25, 2024
by
Michael Goin
Committed by
GitHub
Jul 25, 2024
Browse files
[Bugfix] Fix `kv_cache_dtype=fp8` without scales for FP8 checkpoints (#6761)
parent
889da130
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
6 deletions
+12
-6
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+10
-2
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/kv_cache.py
+2
-4
No files found.
tests/quantization/test_fp8.py
View file @
65b1f121
...
@@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
...
@@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
,
kv_cache_dtype
=
kv_cache_dtype
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
assert
isinstance
(
fc1
.
quant_method
,
Fp8LinearMethod
)
if
kv_cache_dtype
==
"fp8"
:
attn
=
model
.
model
.
decoder
.
layers
[
0
].
self_attn
.
attn
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
...
...
vllm/model_executor/layers/quantization/kv_cache.py
View file @
65b1f121
...
@@ -46,10 +46,8 @@ class BaseKVCacheMethod(QuantizeMethodBase):
...
@@ -46,10 +46,8 @@ class BaseKVCacheMethod(QuantizeMethodBase):
elif
layer
.
k_scale
<
0.0
and
layer
.
v_scale
<
0.0
:
elif
layer
.
k_scale
<
0.0
and
layer
.
v_scale
<
0.0
:
# If no scales were loaded (both scales are invalid negative
# If no scales were loaded (both scales are invalid negative
# values), use the default value of 1.0
# values), use the default value of 1.0
k_scale
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
1.0
),
k_scale
=
1.0
requires_grad
=
False
)
v_scale
=
1.0
v_scale
=
torch
.
nn
.
Parameter
(
torch
.
tensor
(
1.0
),
requires_grad
=
False
)
else
:
else
:
# If we find a single kv_scale in the checkpoint, we remap
# If we find a single kv_scale in the checkpoint, we remap
# kv_scale to k_scale during weight loading, and duplicate
# kv_scale to k_scale during weight loading, and duplicate
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment