Unverified Commit 4934d492 authored by Jong-hun Shin, committed by GitHub

Support GPT-NeoX Models without attention biases (#2301)

parent 358c328d
@@ -54,6 +54,7 @@ class GPTNeoXAttention(nn.Module):
         self.total_num_heads = config.num_attention_heads
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.total_num_heads
+        self.bias = getattr(config, "attention_bias", True)

         tensor_model_parallel_world_size = (
             get_tensor_model_parallel_world_size())
@@ -65,11 +66,13 @@ class GPTNeoXAttention(nn.Module):
             config.hidden_size,
             self.head_size,
             self.total_num_heads,
+            bias=self.bias,
             linear_method=linear_method,
         )
         self.dense = RowParallelLinear(
             config.hidden_size,
             config.hidden_size,
+            bias=self.bias,
             linear_method=linear_method,
         )
         scaling = self.head_size**-0.5
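The crux of the change is the getattr fallback: configs that predate the attention_bias field keep the old behavior (bias terms enabled), while configs that explicitly set attention_bias to False now build bias-free QKV and output projections. A minimal sketch of that fallback, using hypothetical SimpleNamespace stand-ins rather than a real transformers GPTNeoXConfig:

    from types import SimpleNamespace

    # Hypothetical configs for illustration only; vLLM reads these fields
    # from the model's Hugging Face config object.
    legacy_config = SimpleNamespace(num_attention_heads=8, hidden_size=512)
    no_bias_config = SimpleNamespace(num_attention_heads=8, hidden_size=512,
                                     attention_bias=False)

    # The attribute is absent on older configs, so the default of True
    # preserves the previous always-bias behavior.
    assert getattr(legacy_config, "attention_bias", True) is True
    assert getattr(no_bias_config, "attention_bias", True) is False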