Unverified Commit a78565b7 authored by Suraj Patil, committed by GitHub

[MT5Config] add relative_attention_max_distance in config (#16170)

parent 4f4e5ddb
@@ -50,6 +50,8 @@ class MT5Config(PretrainedConfig):
Number of attention heads for each attention layer in the Transformer encoder.
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
The number of buckets to use for each attention layer.
relative_attention_max_distance (`int`, *optional*, defaults to 128):
The maximum distance of the longer sequences for the bucket separation.
dropout_rate (`float`, *optional*, defaults to 0.1):
The ratio for all dropout layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
@@ -75,6 +77,7 @@ class MT5Config(PretrainedConfig):
num_decoder_layers=None,
num_heads=6,
relative_attention_num_buckets=32,
relative_attention_max_distance=128,
dropout_rate=0.1,
layer_norm_epsilon=1e-6,
initializer_factor=1.0,
@@ -107,6 +110,7 @@ class MT5Config(PretrainedConfig):
) # default = symmetry
self.num_heads = num_heads
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
self.dropout_rate = dropout_rate
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_factor = initializer_factor
...
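A minimal sketch of how the new option can be exercised once this commit is in, assuming a transformers version that includes it. `relative_attention_max_distance` caps the relative position used when assigning attention-bias buckets: distances up to roughly half the buckets get exact buckets, larger distances are spread log-spaced up to this maximum, and anything beyond it shares the last bucket (this mirrors the T5 bucketing scheme the MT5 model reuses). The small dimension values below are illustrative only, not recommended defaults.

```python
from transformers import MT5Config

# Tiny illustrative config; relative_attention_max_distance is the option
# added by this commit (default 128).
config = MT5Config(
    d_model=64,
    d_ff=128,
    num_layers=2,
    num_heads=4,
    relative_attention_num_buckets=32,
    relative_attention_max_distance=64,  # distances beyond this map to the last bucket
)

print(config.relative_attention_num_buckets)   # 32
print(config.relative_attention_max_distance)  # 64
```

Before this change the attribute was missing from `MT5Config`, so code that read `config.relative_attention_max_distance` (as the shared T5 attention layers do) could not pick up a value from the MT5 configuration.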