Unverified Commit 6af3ce77 authored by Sanchit Gandhi, committed by GitHub

[Flax LLaMA] Fix attn dropout (#28059)

parent 7e876dca
@@ -289,6 +289,10 @@ class FlaxLlamaAttention(nn.Module):
         attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
         attention_mask = combine_masks(attention_mask, causal_mask)
 
+        dropout_rng = None
+        if not deterministic and self.config.attention_dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
         # During fast autoregressive decoding, we feed one position at a time,
         # and cache the keys and values step by step.
         if self.has_variable("cache", "cached_key") or init_cache:
@@ -307,6 +311,8 @@ class FlaxLlamaAttention(nn.Module):
             query,
             key,
             bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_dropout,
             deterministic=deterministic,
             dtype=attention_dtype,
         )
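For context, the patch makes attention dropout actually take effect by requesting a "dropout" RNG from the module and forwarding it, together with the configured dropout rate, to Flax's attention-weight computation. Below is a minimal sketch of that pattern outside of Transformers, assuming recent `jax` and `flax` releases; `ToyAttention` and its `attention_dropout` field are hypothetical stand-ins for `FlaxLlamaAttention` and `config.attention_dropout`, not the library's actual class.

```python
# Minimal sketch of the attention-dropout wiring introduced by the patch.
# ToyAttention is a hypothetical module, not part of Transformers.
import jax
import jax.numpy as jnp
import flax.linen as nn
from flax.linen.attention import dot_product_attention_weights


class ToyAttention(nn.Module):
    attention_dropout: float = 0.1  # stands in for config.attention_dropout

    @nn.compact
    def __call__(self, query, key, value, deterministic: bool = True):
        # Mirror the patched logic: only request an RNG when dropout is active,
        # so deterministic calls don't need rngs={"dropout": ...}.
        dropout_rng = None
        if not deterministic and self.attention_dropout > 0.0:
            dropout_rng = self.make_rng("dropout")

        attn_weights = dot_product_attention_weights(
            query,
            key,
            dropout_rng=dropout_rng,
            dropout_rate=self.attention_dropout,
            deterministic=deterministic,
            dtype=jnp.float32,
        )
        # Standard attention readout: weights x values.
        return jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)


if __name__ == "__main__":
    rng = jax.random.PRNGKey(0)
    q = k = v = jnp.ones((1, 4, 2, 8))  # (batch, seq, heads, head_dim)
    model = ToyAttention()
    params = model.init(rng, q, k, v)  # deterministic=True, no dropout RNG needed

    # Training-style call: dropout is applied, so a "dropout" RNG is required.
    out = model.apply(params, q, k, v, deterministic=False, rngs={"dropout": rng})
    print(out.shape)  # (1, 4, 2, 8)
```

The key point of the fix is that `dot_product_attention_weights` only applies dropout when it receives both a non-zero `dropout_rate` and a `dropout_rng`; without the two added keyword arguments in the diff, the configured attention dropout was silently ignored.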