Unverified commit be444507, authored by Zebing Lin, committed by GitHub
Browse files

[Fix][Spec Decode] Fix llama4 draft loading with different quantization (#27136)


Signed-off-by: linzebing <linzebing1995@gmail.com>
parent f381cf23
......@@ -60,6 +60,10 @@ class LlamaModel(nn.Module):
prefix=maybe_prefix(prefix, "embed_tokens"),
)
# Temporarily modify vllm_config.quant_config for draft model layers
original_quant_config = vllm_config.quant_config
vllm_config.quant_config = quant_config
try:
self.layers = nn.ModuleList(
[
Llama4DecoderLayer(
......@@ -70,6 +74,9 @@ class LlamaModel(nn.Module):
for i in range(self.config.num_hidden_layers)
]
)
finally:
# Restore original quant_config
vllm_config.quant_config = original_quant_config
self.fc = torch.nn.Linear(
self.config.hidden_size * 2, self.config.hidden_size, bias=False
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment