Commit 1e8438a8 (unverified), authored by Eldar Kurtić, committed by GitHub
Browse files

[Llama4,CI] Bring back Llama-4 bug fixes, and also fix Maverick tests (#35033)


Signed-off-by: Eldar Kurtic <you@example.com>
Co-authored-by: Eldar Kurtic <you@example.com>
parent 8435b2e0
...@@ -305,10 +305,10 @@ def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Te ...@@ -305,10 +305,10 @@ def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Te
# Self-attention weights (separate q, k, v projections) # Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn( weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16 num_attention_heads * head_dim, hidden_size, dtype=torch.bfloat16
) )
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn( weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16 num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
) )
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn( weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16 num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
......
...@@ -44,9 +44,6 @@ from vllm.model_executor.layers.linear import ( ...@@ -44,9 +44,6 @@ from vllm.model_executor.layers.linear import (
RowParallelLinear, RowParallelLinear,
) )
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.compressed_tensors import (
compressed_tensors as ct,
)
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, default_weight_loader,
...@@ -831,74 +828,38 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): ...@@ -831,74 +828,38 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts):
name: str, name: str,
loaded_weight: torch.Tensor, loaded_weight: torch.Tensor,
) -> tuple[str, torch.Tensor]: ) -> tuple[str, torch.Tensor]:
# Helper function to permute the weight's channels modules = name.split(".")
def permute( # Permute Q/K weights and corresponding scales for rotary embedding.
w: torch.Tensor, # This pathway is validated against modelopt and compressed-tensors ckpts,
n_heads: int, # and for per-tensor, per-group (e.g. GPTQ), and per-channel quant schemes.
is_nvfp4_weight_scale: bool, # Note: permutations are not feasible only for per-block (e.g. DeepSeek 128x128)
is_ct_int8_or_fp8_weight_scale: bool, # For per-block quantization, consider not quantizing q/k_proj.
): is_weight = modules[-1] in ("weight", "weight_packed")
# Calculate the expected shape of the weight. is_weight_scale = (
# Do not rely on w's shape, as it may be in another layout. modules[-1] == "weight_scale"
attn_in = self.config.head_dim * n_heads and loaded_weight.numel() > 1 # no need to permute per-tensor scales
attn_out = ( )
self.config.hidden_size is_k_proj = "wk" in modules or "k_proj" in modules
if not is_ct_int8_or_fp8_weight_scale is_q_proj = "wq" in modules or "q_proj" in modules
else w.shape[-1]
if (is_weight or is_weight_scale) and (is_k_proj or is_q_proj):
original_ndim = loaded_weight.ndim
if original_ndim == 1:
loaded_weight = loaded_weight.unsqueeze(-1)
f_out, f_in = loaded_weight.shape
n_heads = (
self.config.num_key_value_heads
if is_k_proj
else self.config.num_attention_heads
) )
loaded_weight = (
# If the weight is FP4 packed as uint8, we need to divide attn_out loaded_weight.view(n_heads, f_out // n_heads // 2, 2, f_in)
# by 2.
if w.dtype == torch.uint8 and w.shape[1] * 2 == attn_out:
attn_out = attn_out // 2
# If the weight is a weight scale, we need to divide attn_out by
# block size, which is currently 16.
elif (
w.dtype == torch.float8_e4m3fn
and is_nvfp4_weight_scale
and w.shape[1] * 16 == attn_out
):
attn_out = attn_out // 16
return (
w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
.transpose(1, 2) .transpose(1, 2)
.reshape(attn_in, attn_out) .reshape(f_out, f_in)
) )
modules = name.split(".") if original_ndim == 1:
loaded_weight = loaded_weight.squeeze(-1)
# Permute Q/K weights and weight block scales for rotary embedding
is_weight = modules[-1] == "weight"
is_nvfp4_weight_scale = (
modules[-1] == "weight_scale" and loaded_weight.dtype == torch.float8_e4m3fn
)
is_ct_int8_or_fp8_weight_scale = False
if modules[-1] == "weight_scale" and isinstance(
self.model.quant_config, ct.CompressedTensorsConfig
):
from compressed_tensors import CompressionFormat
is_ct_int8_or_fp8_weight_scale = self.model.quant_config.quant_format in [
CompressionFormat.int_quantized.value,
CompressionFormat.float_quantized.value,
] and loaded_weight.dtype in [torch.float16, torch.bfloat16, torch.float32]
if is_weight or is_nvfp4_weight_scale or is_ct_int8_or_fp8_weight_scale:
if "wk" in modules or "k_proj" in modules:
loaded_weight = permute(
loaded_weight,
self.config.num_key_value_heads,
is_nvfp4_weight_scale,
is_ct_int8_or_fp8_weight_scale,
)
elif "wq" in modules or "q_proj" in modules:
loaded_weight = permute(
loaded_weight,
self.config.num_attention_heads,
is_nvfp4_weight_scale,
is_ct_int8_or_fp8_weight_scale,
)
return name, loaded_weight return name, loaded_weight
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment