Unverified Commit 1fd0cf8a authored by Michael Feil, committed by GitHub

Update comment in qwen2.py (#4447)

parent bf63ee54
@@ -170,6 +170,7 @@ class Qwen2MoeAttention(nn.Module):
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
+        qkv_bias: bool = True,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> None:
@@ -201,7 +202,7 @@ class Qwen2MoeAttention(nn.Module):
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
-            bias=True,
+            bias=qkv_bias,
             quant_config=quant_config,
             prefix=add_prefix("qkv_proj", prefix),
         )
@@ -257,6 +258,8 @@ class Qwen2MoeDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # note: replace config.num_hidden_layers < 80 with True once config.qkv_bias is available in transformers 4.50.0
+        qkv_bias = getattr(config, "qkv_bias", config.num_hidden_layers < 80)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -266,6 +269,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            qkv_bias=qkv_bias,
             prefix=add_prefix("self_attn", prefix),
         )
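The change above reads an optional qkv_bias flag from the model config, falling back to a layer-count heuristic, and threads it into the attention module's fused QKV projection. The snippet below is a minimal, self-contained sketch of that fallback pattern, not the sglang implementation: TinyConfig, TinyAttention, and build_attention are hypothetical names, and a plain nn.Linear stands in for QKVParallelLinear.

# Sketch only: illustrates the getattr-based qkv_bias fallback from the diff.
import torch.nn as nn


class TinyConfig:
    """Hypothetical config object mimicking the relevant HF config attributes."""

    def __init__(self, hidden_size=64, num_attention_heads=4, num_hidden_layers=24, qkv_bias=None):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        if qkv_bias is not None:  # only set the attribute when the config provides it
            self.qkv_bias = qkv_bias


class TinyAttention(nn.Module):
    """Hypothetical stand-in for Qwen2MoeAttention; only the bias wiring is shown."""

    def __init__(self, hidden_size: int, num_heads: int, qkv_bias: bool = True):
        super().__init__()
        head_dim = hidden_size // num_heads
        # bias on the fused QKV projection is now controlled by the config flag
        self.qkv_proj = nn.Linear(hidden_size, 3 * num_heads * head_dim, bias=qkv_bias)


def build_attention(config: TinyConfig) -> TinyAttention:
    # Same fallback pattern as the diff: prefer config.qkv_bias when the
    # attribute exists, otherwise assume bias only for models with < 80 layers.
    qkv_bias = getattr(config, "qkv_bias", config.num_hidden_layers < 80)
    return TinyAttention(config.hidden_size, config.num_attention_heads, qkv_bias=qkv_bias)


if __name__ == "__main__":
    small = build_attention(TinyConfig(num_hidden_layers=24))       # fallback -> bias=True
    large = build_attention(TinyConfig(num_hidden_layers=80))       # fallback -> bias=False
    explicit = build_attention(TinyConfig(qkv_bias=False))          # explicit config wins
    print(small.qkv_proj.bias is not None)     # True
    print(large.qkv_proj.bias is not None)     # False
    print(explicit.qkv_proj.bias is not None)  # False

The point of the getattr fallback is that configs which already carry qkv_bias win outright, while older configs keep the previous bias=True behaviour for models with fewer than 80 layers.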