Unverified Commit cc034f72 authored by Anmol Joshi, committed by GitHub

Replace assertion with exception (#16720)



* Updated assertions to exceptions

* updated assertions to exceptions

* bug fixes

* fix-copies

* Update modeling_ctrl.py

* Update src/transformers/models/ctrl/modeling_tf_ctrl.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gpt_neo/modeling_gpt_neo.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gptj/modeling_gptj.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gptj/modeling_tf_gptj.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update modeling_led.py

* Update modeling_led.py

* Update modeling_led.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 14daa610
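
Every hunk below applies the same transformation: a bare `assert` on user-provided values is replaced with an explicit `if` check that raises a specific exception (`ValueError` for invalid values, `TypeError` for wrong model usage, `KeyError`/`AttributeError` for missing entries, `FileNotFoundError` for a missing file). Unlike assertions, these checks still run when Python is started with the `-O` flag, and the exception type tells callers what went wrong. A minimal sketch of the pattern (function names and values are illustrative, not from the diff):

def check_block_size_with_assert(seq_length: int, block_size: int) -> None:
    # Old style: skipped entirely under `python -O`, and only ever raises AssertionError.
    assert seq_length % block_size == 0, "Sequence length must be multiple of block size"


def check_block_size_with_exception(seq_length: int, block_size: int) -> None:
    # New style: always executed, and the exception type documents the failure mode.
    if seq_length % block_size != 0:
        raise ValueError(
            f"Sequence length must be multiple of block size, but sequence length is {seq_length}, "
            f"while block size is {block_size}."
        )
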
@@ -145,7 +145,8 @@ def load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=False):
     # Load weights from TF model
     init_vars = tf.saved_model.load(tf_path).variables if is_trivia_qa else tf.train.list_variables(tf_path)
-    assert len(init_vars) > 0, "Loaded trained variables cannot be empty."
+    if len(init_vars) <= 0:
+        raise ValueError("Loaded trained variables cannot be empty.")
     pt_names = list(model.state_dict().keys())
@@ -460,8 +461,11 @@ class BigBirdBlockSparseAttention(nn.Module):
         to_seq_length = from_seq_length = seqlen
         from_block_size = to_block_size = self.block_size
-        assert from_seq_length % from_block_size == 0, "Query sided sequence length must be multiple of block size"
-        assert to_seq_length % to_block_size == 0, "Key/Value sided sequence length must be multiple of block size"
+        if from_seq_length % from_block_size != 0:
+            raise ValueError("Query sided sequence length must be multiple of block size")
+        if to_seq_length % to_block_size != 0:
+            raise ValueError("Key/Value sided sequence length must be multiple of block size")
         query_layer = self.transpose_for_scores(self.query(hidden_states))
         key_layer = self.transpose_for_scores(self.key(hidden_states))
@@ -1077,9 +1081,8 @@ class BigBirdBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
         rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
         middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
@@ -1153,11 +1156,11 @@ class BigBirdBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length not in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
-        assert from_seq_length in plan_from_length, "Error from sequence length not in plan!"
+        if from_seq_length not in plan_from_length:
+            raise ValueError("Error from sequence length not in plan!")
         # Total number of blocks in the mmask
         num_blocks = from_seq_length // from_block_size
@@ -1397,9 +1400,8 @@ class BigBirdAttention(nn.Module):
                 output_attentions,
             )
         else:
-            assert (
-                encoder_hidden_states is None
-            ), "BigBird cannot be used as a decoder when config.attention_type != 'original_full'"
+            if encoder_hidden_states is not None:
+                raise ValueError("BigBird cannot be used as a decoder when config.attention_type != 'original_full'")
             self_outputs = self.self(
                 hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions
             )
@@ -1451,7 +1453,8 @@ class BigBirdLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
-            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
+            if not self.is_decoder:
+                raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
             self.crossattention = BigBirdAttention(config)
         self.intermediate = BigBirdIntermediate(config)
         self.output = BigBirdOutput(config)
@@ -2183,9 +2186,10 @@ class BigBirdModel(BigBirdPreTrainedModel):
     def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int):
         batch_size, seq_length = attention_mask.size()
-        assert (
-            seq_length % block_size == 0
-        ), f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+        if seq_length % block_size != 0:
+            raise ValueError(
+                f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+            )
         def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
             """
......
@@ -389,9 +389,10 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
     def create_masks_for_block_sparse_attn(attention_mask, block_size: int):
         batch_size, seq_length = attention_mask.shape
-        assert (
-            seq_length % block_size == 0
-        ), f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+        if seq_length % block_size != 0:
+            raise ValueError(
+                f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+            )
         def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
             """
@@ -464,8 +465,12 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
         to_seq_len = key_layer.shape[2]
         from_block_size = to_block_size = self.config.block_size
-        assert from_seq_len % from_block_size == 0, "Query sided sequence length must be multiple of block size"
-        assert to_seq_len % to_block_size == 0, "Key/Value sided sequence length must be multiple of block size"
+        if from_seq_len % from_block_size != 0:
+            raise ValueError("Query sided sequence length must be multiple of block size")
+        if to_seq_len % to_block_size != 0:
+            raise ValueError("Key/Value sided sequence length must be multiple of block size")
         if from_seq_len // from_block_size != to_seq_len // to_block_size:
             raise ValueError("Error the number of blocks needs to be same!")
@@ -863,9 +868,8 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
         rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
         middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
@@ -939,11 +943,11 @@ class FlaxBigBirdBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length not in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
-        assert from_seq_length in plan_from_length, "Error from sequence length not in plan!"
+        if from_seq_length not in plan_from_length:
+            raise ValueError("Error from sequence length not in plan!")
         # Total number of blocks in the mmask
         num_blocks = from_seq_length // from_block_size
......
@@ -83,7 +83,8 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start
     shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
     shifted_input_ids[:, 0] = decoder_start_token_id
-    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
@@ -287,8 +288,11 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
         to_seq_length = from_seq_length = seqlen
         from_block_size = to_block_size = self.block_size
-        assert from_seq_length % from_block_size == 0, "Query sided sequence length must be multiple of block size"
-        assert to_seq_length % to_block_size == 0, "Key/Value sided sequence length must be multiple of block size"
+        if from_seq_length % from_block_size != 0:
+            raise ValueError("Query sided sequence length must be multiple of block size")
+        if to_seq_length % to_block_size != 0:
+            raise ValueError("Key/Value sided sequence length must be multiple of block size")
         query_layer = self.transpose_for_scores(self.query(hidden_states))
         key_layer = self.transpose_for_scores(self.key(hidden_states))
@@ -904,9 +908,8 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
         rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
         middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
@@ -980,11 +983,11 @@ class BigBirdPegasusBlockSparseAttention(nn.Module):
         """
         # using this method when from_seq_length not in [1024, 3072, 4096]
-        assert (
-            from_seq_length // from_block_size == to_seq_length // to_block_size
-        ), "Error the number of blocks needs to be same!"
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
-        assert from_seq_length in plan_from_length, "Error from sequence length not in plan!"
+        if from_seq_length not in plan_from_length:
+            raise ValueError("Error from sequence length not in plan!")
         # Total number of blocks in the mmask
         num_blocks = from_seq_length // from_block_size
@@ -1914,9 +1917,10 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
@@ -1997,9 +2001,10 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel):
     def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int):
         batch_size, seq_length = attention_mask.size()
-        assert (
-            seq_length % block_size == 0
-        ), f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+        if seq_length % block_size != 0:
+            raise ValueError(
+                f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block size is {block_size}."
+            )
         def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
             """
@@ -2242,9 +2247,10 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
......
@@ -747,9 +747,10 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -986,9 +987,10 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
......
@@ -746,9 +746,10 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -983,9 +984,10 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
......
@@ -184,13 +184,10 @@ def load_tf_weights_in_canine(model, config, tf_checkpoint_path):
             pointer = getattr(pointer, "weight")
         elif m_name == "kernel":
             array = np.transpose(array)
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
+        if pointer.shape != array.shape:
+            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
         logger.info(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
......
@@ -179,9 +179,10 @@ class CLIPAttention(nn.Module):
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
-        assert (
-            self.head_dim * self.num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
         self.dropout = config.attention_dropout
......
@@ -259,9 +259,10 @@ class FlaxCLIPAttention(nn.Module):
         self.embed_dim = self.config.hidden_size
         self.num_heads = self.config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
-        assert (
-            self.head_dim * self.num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
         self.scale = self.head_dim**-0.5
         self.dropout = self.config.attention_dropout
......
@@ -317,9 +317,8 @@ class ConvBertSelfAttention(nn.Module):
         self.head_ratio = config.head_ratio
         self.conv_kernel_size = config.conv_kernel_size
-        assert (
-            config.hidden_size % self.num_attention_heads == 0
-        ), "hidden_size should be divisible by num_attention_heads"
+        if config.hidden_size % self.num_attention_heads != 0:
+            raise ValueError("hidden_size should be divisible by num_attention_heads")
         self.attention_head_size = config.hidden_size // config.num_attention_heads
         self.all_head_size = self.num_attention_heads * self.attention_head_size
@@ -554,7 +553,8 @@ class ConvBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
-            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
+            if not self.is_decoder:
+                raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
             self.crossattention = ConvBertAttention(config)
         self.intermediate = ConvBertIntermediate(config)
         self.output = ConvBertOutput(config)
@@ -578,9 +578,10 @@ class ConvBertLayer(nn.Module):
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
         if self.is_decoder and encoder_hidden_states is not None:
-            assert hasattr(
-                self, "crossattention"
-            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
+            if not hasattr(self, "crossattention"):
+                raise AttributeError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
+                )
             cross_attention_outputs = self.crossattention(
                 attention_output,
                 encoder_attention_mask,
......
@@ -168,9 +168,8 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer):
         self.num_attention_heads = num_attention_heads
         self.conv_kernel_size = config.conv_kernel_size
-        assert (
-            config.hidden_size % self.num_attention_heads == 0
-        ), "hidden_size should be divisible by num_attention_heads"
+        if config.hidden_size % self.num_attention_heads != 0:
+            raise ValueError("hidden_size should be divisible by num_attention_heads")
         self.attention_head_size = config.hidden_size // config.num_attention_heads
         self.all_head_size = self.num_attention_heads * self.attention_head_size
......
@@ -404,7 +404,8 @@ class CTRLModel(CTRLPreTrainedModel):
         # Attention mask.
         if attention_mask is not None:
-            assert batch_size > 0, "batch_size has to be defined and > 0"
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
             attention_mask = attention_mask.view(batch_size, -1)
             # We create a 3D attention mask from a 2D tensor mask.
             # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -669,9 +670,8 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
         else:
             batch_size, sequence_length = inputs_embeds.shape[:2]
-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
         if self.config.pad_token_id is None:
             sequence_lengths = -1
......
@@ -816,9 +816,8 @@ class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassific
             batch_size, sequence_length = shape_list(input_ids)[:2]
         else:
             batch_size, sequence_length = shape_list(inputs_embeds)[:2]
-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
         if not tf.is_tensor(sequence_lengths):
             in_logits = logits[0:batch_size, sequence_lengths]
......
@@ -292,7 +292,8 @@ class SPMTokenizer:
         self.vocab_file = vocab_file
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
-        assert os.path.exists(vocab_file)
+        if not os.path.exists(vocab_file):
+            raise FileNotFoundError(f"{vocab_file} does not exist!")
         spm.load(vocab_file)
         bpe_vocab_size = spm.GetPieceSize()
         # Token map
......
@@ -412,7 +412,8 @@ class DetrSinePositionEmbedding(nn.Module):
         self.scale = scale
     def forward(self, pixel_values, pixel_mask):
-        assert pixel_mask is not None, "No pixel mask provided"
+        if pixel_mask is None:
+            raise ValueError("No pixel mask provided")
         y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
         x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
         if self.normalize:
@@ -486,9 +487,10 @@ class DetrAttention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim**-0.5
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -1254,7 +1256,8 @@ class DetrModel(DetrPreTrainedModel):
         # get final feature map and downsampled mask
         feature_map, mask = features[-1]
-        assert mask is not None, "Backbone does not return downsampled pixel mask"
+        if mask is None:
+            raise ValueError("Backbone does not return downsampled pixel mask")
         # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
         projected_feature_map = self.input_projection(feature_map)
@@ -1709,9 +1712,10 @@ class DetrMaskHeadSmallConv(nn.Module):
     def __init__(self, dim, fpn_dims, context_dim):
         super().__init__()
-        assert (
-            dim % 8 == 0
-        ), "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8"
+        if dim % 8 != 0:
+            raise ValueError(
+                "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in GroupNorm is set to 8"
+            )
         inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
@@ -1897,7 +1901,8 @@ class DetrLoss(nn.Module):
         Classification loss (NLL) targets dicts must contain the key "class_labels" containing a tensor of dim
         [nb_target_boxes]
         """
-        assert "logits" in outputs, "No logits were found in the outputs"
+        if "logits" not in outputs:
+            raise KeyError("No logits were found in the outputs")
         src_logits = outputs["logits"]
         idx = self._get_src_permutation_idx(indices)
@@ -1935,7 +1940,8 @@ class DetrLoss(nn.Module):
         Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
         are expected in format (center_x, center_y, w, h), normalized by the image size.
         """
-        assert "pred_boxes" in outputs, "No predicted boxes found in outputs"
+        if "pred_boxes" not in outputs:
+            raise KeyError("No predicted boxes found in outputs")
         idx = self._get_src_permutation_idx(indices)
         src_boxes = outputs["pred_boxes"][idx]
         target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
@@ -1957,7 +1963,8 @@ class DetrLoss(nn.Module):
         Targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w].
         """
-        assert "pred_masks" in outputs, "No predicted masks found in outputs"
+        if "pred_masks" not in outputs:
+            raise KeyError("No predicted masks found in outputs")
         src_idx = self._get_src_permutation_idx(indices)
         tgt_idx = self._get_tgt_permutation_idx(indices)
@@ -2002,7 +2009,8 @@ class DetrLoss(nn.Module):
             "boxes": self.loss_boxes,
             "masks": self.loss_masks,
         }
-        assert loss in loss_map, f"Loss {loss} not supported"
+        if loss not in loss_map:
+            raise ValueError(f"Loss {loss} not supported")
         return loss_map[loss](outputs, targets, indices, num_boxes)
     def forward(self, outputs, targets):
@@ -2097,7 +2105,8 @@ class DetrHungarianMatcher(nn.Module):
         self.class_cost = class_cost
         self.bbox_cost = bbox_cost
         self.giou_cost = giou_cost
-        assert class_cost != 0 or bbox_cost != 0 or giou_cost != 0, "All costs of the Matcher can't be 0"
+        if class_cost == 0 or bbox_cost == 0 or giou_cost == 0:
+            raise ValueError("All costs of the Matcher can't be 0")
     @torch.no_grad()
     def forward(self, outputs, targets):
......
@@ -176,7 +176,8 @@ class DPREncoder(DPRPreTrainedModel):
     def __init__(self, config: DPRConfig):
         super().__init__(config)
         self.bert_model = BertModel(config, add_pooling_layer=False)
-        assert self.bert_model.config.hidden_size > 0, "Encoder hidden_size can't be zero"
+        if self.bert_model.config.hidden_size <= 0:
+            raise ValueError("Encoder hidden_size can't be zero")
         self.projection_dim = config.projection_dim
         if self.projection_dim > 0:
             self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
......
@@ -156,7 +156,8 @@ class TFDPREncoderLayer(tf.keras.layers.Layer):
         self.bert_model = TFBertMainLayer(config, add_pooling_layer=False, name="bert_model")
         self.config = config
-        assert self.config.hidden_size > 0, "Encoder hidden_size can't be zero"
+        if self.config.hidden_size <= 0:
+            raise ValueError("Encoder hidden_size can't be zero")
         self.projection_dim = config.projection_dim
         if self.projection_dim > 0:
             self.encode_proj = tf.keras.layers.Dense(
......
@@ -234,9 +234,10 @@ class CustomDPRReaderTokenizerMixin:
         texts = texts if not isinstance(texts, str) else [texts]
         n_passages = len(titles)
         questions = questions if not isinstance(questions, str) else [questions] * n_passages
-        assert len(titles) == len(
-            texts
-        ), f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+        if len(titles) != len(texts):
+            raise ValueError(
+                f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts."
+            )
         encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
         encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
         encoded_inputs = {
@@ -347,9 +348,11 @@ class CustomDPRReaderTokenizerMixin:
         scores = sorted(scores, key=lambda x: x[1], reverse=True)
         chosen_span_intervals = []
         for (start_index, end_index), score in scores:
-            assert start_index <= end_index, f"Wrong span indices: [{start_index}:{end_index}]"
+            if start_index > end_index:
+                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
             length = end_index - start_index + 1
-            assert length <= max_answer_length, f"Span is too long: {length} > {max_answer_length}"
+            if length > max_answer_length:
+                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
             if any(
                 [
                     start_index <= prev_start_index <= prev_end_index <= end_index
......
@@ -115,13 +115,9 @@ def load_tf_weights_in_gpt_neo(model, config, gpt_neo_checkpoint_path):
             # if vocab is padded, then trim off the padding embeddings
            array = array[: config.vocab_size]
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
+        if pointer.shape != array.shape:
+            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched {name}")
         print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
@@ -552,7 +548,8 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
         # Attention mask.
         if attention_mask is not None:
-            assert batch_size > 0, "batch_size has to be defined and > 0"
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
             attention_mask = attention_mask.view(batch_size, -1)
             # We create a 3D attention mask from a 2D tensor mask.
             # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -875,9 +872,8 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel):
         else:
            batch_size, sequence_length = inputs_embeds.shape[:2]
-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
         if self.config.pad_token_id is None:
             sequence_lengths = -1
         else:
......
@@ -573,7 +573,8 @@ class GPTJModel(GPTJPreTrainedModel):
         # Attention mask.
         if attention_mask is not None:
-            assert batch_size > 0, "batch_size has to be defined and > 0"
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
             attention_mask = attention_mask.view(batch_size, -1)
             # We create a 3D attention mask from a 2D tensor mask.
             # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -939,9 +940,8 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel):
         else:
             batch_size = inputs_embeds.shape[0]
-        assert (
-            self.config.pad_token_id is not None or batch_size == 1
-        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
         if self.config.pad_token_id is None:
             sequence_lengths = -1
         else:
......
@@ -934,9 +934,8 @@ class TFGPTJForSequenceClassification(TFGPTJPreTrainedModel, TFSequenceClassific
         loss = None
         if labels is not None:
-            assert (
-                self.config.pad_token_id is not None or logits_shape[0] == 1
-            ), "Cannot handle batch sizes > 1 if no padding token is defined."
+            if self.config.pad_token_id is None and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
             if not tf.is_tensor(sequence_lengths):
                 in_logits = logits[0 : logits_shape[0], sequence_lengths]
......
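
For callers, the practical difference is that invalid inputs now surface as specific, catchable exceptions instead of a bare AssertionError (which also disappears under `python -O`). A small caller-side sketch; the helper below is hypothetical and only mirrors one of the converted checks, it is not part of the library:

def validate_pad_token(pad_token_id, batch_size: int) -> None:
    # Hypothetical helper mirroring the sequence-classification checks above.
    if pad_token_id is None and batch_size != 1:
        raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")


try:
    validate_pad_token(pad_token_id=None, batch_size=8)
except ValueError as err:
    # With the old assert, this would have been AssertionError (or no check at all under `python -O`).
    print(f"Invalid configuration: {err}")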