"README.md.src" did not exist on "4eade17a6e1baf6bc5c71daac7fc3ac595c378a2"
Unverified commit 4b6f069b, authored by Antoni Baum and committed by GitHub
Browse files

Add support for CodeLlama (#854)

parent 791d79de
...@@ -85,6 +85,7 @@ class LlamaAttention(nn.Module): ...@@ -85,6 +85,7 @@ class LlamaAttention(nn.Module):
hidden_size: int, hidden_size: int,
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
rope_theta: float = 10000,
): ):
super().__init__() super().__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
...@@ -99,6 +100,7 @@ class LlamaAttention(nn.Module): ...@@ -99,6 +100,7 @@ class LlamaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = ColumnParallelLinear( self.qkv_proj = ColumnParallelLinear(
hidden_size, hidden_size,
...@@ -118,6 +120,7 @@ class LlamaAttention(nn.Module): ...@@ -118,6 +120,7 @@ class LlamaAttention(nn.Module):
self.attn = PagedAttentionWithRoPE(self.num_heads, self.attn = PagedAttentionWithRoPE(self.num_heads,
self.head_dim, self.head_dim,
self.scaling, self.scaling,
base=self.rope_theta,
rotary_dim=self.head_dim, rotary_dim=self.head_dim,
num_kv_heads=self.num_kv_heads) num_kv_heads=self.num_kv_heads)
...@@ -143,10 +146,13 @@ class LlamaDecoderLayer(nn.Module): ...@@ -143,10 +146,13 @@ class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig): def __init__(self, config: LlamaConfig):
super().__init__() super().__init__()
self.hidden_size = config.hidden_size self.hidden_size = config.hidden_size
# rope_theta is only present on configs from transformers > 4.32.0; fall back to the default base of 10000 otherwise.
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = LlamaAttention( self.self_attn = LlamaAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads, num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
) )
self.mlp = LlamaMLP( self.mlp = LlamaMLP(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment