"docs/vscode:/vscode.git/clone" did not exist on "73fcfb286184114207da5b6400d78a7f7e0fa79e"
Unverified Commit 3224c0c1 authored by Yih-Dar, committed by GitHub

Remove some Kosmos-2 `copied from` (#27149)



* fix

* fix

* fix

* fix

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent cd19b193
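
Context for the hunks below: the model now calls a private `self._gradient_checkpointing_func` that the base `PreTrainedModel` installs, instead of a Kosmos-2-specific `_set_gradient_checkpointing` override. A minimal usage sketch (not part of this commit, checkpoint name only illustrative) showing that the public API is unchanged:

from transformers import Kosmos2ForConditionalGeneration

model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")

# Enable checkpointed forward passes for training; internally this is what
# populates the `_gradient_checkpointing_func` referenced in the diff below.
model.gradient_checkpointing_enable()
model.train()
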
@@ -52,7 +52,6 @@ KOSMOS2_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-# Copied from transformers.models.bart.modeling_bart._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
@@ -67,7 +66,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
@@ -660,7 +658,7 @@ class Kosmos2VisionEncoder(nn.Module):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             if self.gradient_checkpointing and self.training:
-                layer_outputs = self.gradient_checkpointing_func(
+                layer_outputs = self._gradient_checkpointing_func(
                     encoder_layer.__call__,
                     hidden_states,
                     attention_mask,
@@ -1114,7 +1112,6 @@ class Kosmos2TextTransformer(nn.Module):
         self.gradient_checkpointing = False
 
-    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
     def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -1268,7 +1265,7 @@ class Kosmos2TextTransformer(nn.Module):
             past_key_value = past_key_values[idx] if past_key_values is not None else None
 
             if self.gradient_checkpointing and self.training:
-                layer_outputs = self.gradient_checkpointing_func(
+                layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
@@ -1428,11 +1425,6 @@ class Kosmos2PreTrainedModel(PreTrainedModel):
             if module.embed_tokens.padding_idx is not None:
                 module.embed_tokens.weight.data[module.embed_tokens.padding_idx].zero_()
 
-    def _set_gradient_checkpointing(self, module, gradient_checkpointing_func=None):
-        if isinstance(module, (Kosmos2TextTransformer, Kosmos2VisionEncoder)):
-            module.gradient_checkpointing_func = gradient_checkpointing_func
-            module.gradient_checkpointing = gradient_checkpointing_func is not None
 
 
 class Kosmos2VisionModel(Kosmos2PreTrainedModel):
     config_class = Kosmos2VisionConfig