Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
42c19496
Unverified
Commit 42c19496, authored Dec 03, 2025 by Tsukasa OI; committed by GitHub on Dec 03, 2025
Browse files
[Bugfix][Quantization] Support BF16 tensors on GGUF (#29948)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
parent
cc4e296e
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing 2 changed files with 18 additions and 1 deletion
+18
-1
tests/models/quantization/test_gguf.py
tests/models/quantization/test_gguf.py
+7
-0
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/weight_utils.py
+11
-1
No files found.
tests/models/quantization/test_gguf.py
View file @
42c19496
...
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
...
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
gguf_filename
=
"qwen2.5-1.5b-instruct-q6_k.gguf"
,
gguf_filename
=
"qwen2.5-1.5b-instruct-q6_k.gguf"
,
)
)
QWEN3_CONFIG = GGUFTestConfig(
    original_model="Qwen/Qwen3-0.6B",
    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
    gguf_filename="Qwen3-0.6B-BF16.gguf",
)
PHI3_CONFIG
=
GGUFTestConfig
(
PHI3_CONFIG
=
GGUFTestConfig
(
original_model
=
"microsoft/Phi-3.5-mini-instruct"
,
original_model
=
"microsoft/Phi-3.5-mini-instruct"
,
gguf_repo
=
"bartowski/Phi-3.5-mini-instruct-GGUF"
,
gguf_repo
=
"bartowski/Phi-3.5-mini-instruct-GGUF"
,
...
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
...
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
MODELS
=
[
MODELS
=
[
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
QWEN2_CONFIG
,
QWEN2_CONFIG
,
QWEN3_CONFIG
,
PHI3_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
GPT2_CONFIG
,
STABLELM_CONFIG
,
STABLELM_CONFIG
,
...
...
vllm/model_executor/model_loader/weight_utils.py
View file @
42c19496
...
@@ -921,6 +921,16 @@ def gguf_quant_weights_iterator(
...
@@ -921,6 +921,16 @@ def gguf_quant_weights_iterator(
name
=
gguf_to_hf_name_map
[
tensor
.
name
]
name
=
gguf_to_hf_name_map
[
tensor
.
name
]
if
weight_type
.
name
not
in
(
"F32"
,
"BF16"
,
"F16"
):
if
weight_type
.
name
not
in
(
"F32"
,
"BF16"
,
"F16"
):
name
=
name
.
replace
(
"weight"
,
"qweight"
)
name
=
name
.
replace
(
"weight"
,
"qweight"
)
if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
    # BF16 is currently the only "quantization" type that isn't
    # actually quantized but is read as a raw byte tensor.
    # Reinterpret as `torch.bfloat16` tensor.
    weight = weight.view(np.uint16)
    if reader.byte_order == "S":
        # GGUF endianness != system endianness
        weight = weight.byteswap()
    param = torch.tensor(weight).view(torch.bfloat16)
else:
    param = torch.tensor(weight)
param
=
torch
.
tensor
(
weight
)
yield
name
,
param
yield
name
,
param
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment