Unverified commit cc034f72, authored by Anmol Joshi, committed by GitHub

Replace assertion with exception (#16720)



* Updated assertions to exceptions

* updated assertions to exceptions

* bug fixes

* fix-copies

* Update modeling_ctrl.py

* Update src/transformers/models/ctrl/modeling_tf_ctrl.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gpt_neo/modeling_gpt_neo.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gptj/modeling_gptj.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/gptj/modeling_tf_gptj.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update modeling_led.py

* Update modeling_led.py

* Update modeling_led.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 14daa610
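
The pattern is identical in every hunk below: an explicit `raise` of `ValueError` (or `TypeError`, `OSError`, `AttributeError` where the failure mode fits) replaces an `assert`, so the check still runs under `python -O` (which strips `assert` statements) and surfaces as a catchable, descriptive exception instead of an `AssertionError`. A minimal sketch of the before/after, with an illustrative function name not taken from the diff:

```python
def check_attention_window(attention_window: int) -> int:
    # Before: assert attention_window % 2 == 0, "`attention_window` has to be an even value"
    # After: the check survives `python -O` and raises a specific, catchable exception.
    if attention_window % 2 != 0:
        raise ValueError(f"`attention_window` should be an even value. Given {attention_window}")
    return attention_window


check_attention_window(512)  # passes; check_attention_window(511) raises ValueError
```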
@@ -228,9 +228,8 @@ class IBertSelfAttention(nn.Module):
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
-        assert (
-            self.position_embedding_type == "absolute"
-        ), "I-BERT only supports 'absolute' for `config.position_embedding_type`"
+        if self.position_embedding_type != "absolute":
+            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")
         self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)
@@ -429,7 +428,8 @@ class IBertIntermediate(nn.Module):
             quant_mode=self.quant_mode,
             per_channel=True,
         )
-        assert config.hidden_act == "gelu", "I-BERT only supports 'gelu' for `config.hidden_act`"
+        if config.hidden_act != "gelu":
+            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")
         self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
         self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
@@ -67,7 +67,8 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
     shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
     shifted_input_ids[:, 0] = decoder_start_token_id

-    assert pad_token_id is not None, "config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
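
For context, a small usage sketch of what the guarded `shift_tokens_right` computes once the pad-token check passes; the token ids below are invented:

```python
import torch

pad_token_id, decoder_start_token_id = 1, 2   # illustrative ids, not from any real config
input_ids = torch.tensor([[17, -100, 42]])    # -100 marks label positions ignored by the loss

shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
print(shifted_input_ids)  # tensor([[ 2, 17,  1]])
```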
@@ -763,9 +764,10 @@ class LEDDecoderAttention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
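
The arithmetic behind the divisibility guard, as a sketch with made-up sizes (the helper is illustrative, not part of the diff):

```python
def check_heads(embed_dim: int, num_heads: int) -> int:
    head_dim = embed_dim // num_heads  # floor division, so the product can come up short
    if head_dim * num_heads != embed_dim:
        raise ValueError(
            f"embed_dim must be divisible by num_heads (got `embed_dim`: {embed_dim} and `num_heads`: {num_heads})."
        )
    return head_dim


print(check_heads(768, 12))  # 64, since 64 * 12 == 768
# check_heads(768, 10) raises: 768 // 10 == 76 and 76 * 10 == 760 != 768
```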
@@ -833,27 +835,25 @@ class LEDDecoderAttention(nn.Module):
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

-        assert attn_weights.size() == (
-            bsz * self.num_heads,
-            tgt_len,
-            src_len,
-        ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )

         if attention_mask is not None:
-            assert attention_mask.size() == (
-                bsz,
-                1,
-                tgt_len,
-                src_len,
-            ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

         attn_weights = nn.functional.softmax(attn_weights, dim=-1)

         if layer_head_mask is not None:
-            assert layer_head_mask.size() == (
-                self.num_heads,
-            ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+                )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
@@ -871,11 +871,10 @@ class LEDDecoderAttention(nn.Module):
         attn_output = torch.bmm(attn_probs, value_states)

-        assert attn_output.size() == (
-            bsz * self.num_heads,
-            tgt_len,
-            self.head_dim,
-        ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
+            )

         attn_output = (
             attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
@@ -1615,11 +1614,14 @@ class LEDEncoder(LEDPreTrainedModel):
         self.max_source_positions = config.max_encoder_position_embeddings

         if isinstance(config.attention_window, int):
-            assert config.attention_window % 2 == 0, "`config.attention_window` has to be an even value"
-            assert config.attention_window > 0, "`config.attention_window` has to be positive"
+            if config.attention_window % 2 != 0:
+                raise ValueError("`config.attention_window` has to be an even value")
+            if config.attention_window <= 0:
+                raise ValueError("`config.attention_window` has to be positive")
             config.attention_window = [config.attention_window] * config.num_hidden_layers  # one value per layer
         else:
-            assert len(config.attention_window) == config.num_hidden_layers, (
-                "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
-                f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
-            )
+            if len(config.attention_window) != config.num_hidden_layers:
+                raise ValueError(
+                    "`len(config.attention_window)` should equal `config.num_hidden_layers`. "
+                    f"Expected {config.num_hidden_layers}, given {len(config.attention_window)}"
+                )
@@ -1667,7 +1669,8 @@ class LEDEncoder(LEDPreTrainedModel):
             else max(self.config.attention_window)
         )

-        assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"
+        if attention_window % 2 != 0:
+            raise ValueError(f"`attention_window` should be an even value. Given {attention_window}")
         input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape
         batch_size, seq_len = input_shape[:2]
@@ -1810,9 +1813,10 @@ class LEDEncoder(LEDPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -2065,9 +2069,10 @@ class LEDDecoder(LEDPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {attn_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
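
A sketch of what the layer-count guard now surfaces to callers; the layer count and mask size are invented:

```python
import torch

num_layers = 6             # stands in for len(self.layers)
head_mask = torch.ones(4)  # wrong on purpose: one entry per decoder layer is required
try:
    if head_mask.size()[0] != num_layers:
        raise ValueError(
            f"The head_mask should be specified for {num_layers} layers, but it is for {head_mask.size()[0]}."
        )
except ValueError as err:
    print(err)  # catchable; the old assert disappeared entirely under `python -O`
```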
@@ -790,9 +790,10 @@ class M2M100Encoder(M2M100PreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -1013,9 +1014,10 @@ class M2M100Decoder(M2M100PreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {attn_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
@@ -303,7 +303,8 @@ class M2M100Tokenizer(PreTrainedTokenizer):
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         save_dir = Path(save_directory)
-        assert save_dir.is_dir(), f"{save_directory} should be a directory"
+        if not save_dir.is_dir():
+            raise OSError(f"{save_directory} should be a directory")
         vocab_save_path = save_dir / (
             (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
         )
@@ -1185,9 +1185,10 @@ class DetrAttention(nn.Module):
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        assert (
-            self.head_dim * num_heads == self.embed_dim
-        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
+            )
         self.scaling = self.head_dim**-0.5
         self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -226,7 +226,8 @@ def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int) -> jnp.ndarray:
     """
     prev_output_tokens = np.array(input_ids).copy()

-    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     prev_output_tokens = np.where(prev_output_tokens == -100, pad_token_id, input_ids)
@@ -78,7 +78,8 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
     """
     prev_output_tokens = input_ids.clone()

-    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
@@ -805,9 +806,10 @@ class MBartEncoder(MBartPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -1044,9 +1046,10 @@ class MBartDecoder(MBartPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {attn_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
@@ -66,7 +66,8 @@ def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int):
     Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that MBart does not
     have a single `decoder_start_token_id` in contrast to other Bart-like models.
     """
-    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     input_ids = tf.where(input_ids == -100, tf.fill(shape_list(input_ids), pad_token_id), input_ids)
     language_id_index = (
@@ -130,13 +130,8 @@ def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path):
             pointer = getattr(pointer, "weight")
         elif m_name == "kernel":
             array = np.transpose(array)
-        try:
-            assert (
-                pointer.shape == array.shape
-            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
+        if pointer.shape != array.shape:
+            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
         logger.info("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
     return model
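
The old loader wrapped its assert in `try`/`except AssertionError` only to append the two shapes and re-raise; raising a descriptive `ValueError` directly is equivalent and shorter. A standalone sketch with dummy shapes:

```python
import numpy as np
import torch

pointer = torch.zeros(2, 3)  # dummy PyTorch parameter
array = np.zeros((3, 2))     # dummy checkpoint array (e.g. an untransposed kernel)
try:
    if pointer.shape != array.shape:  # torch.Size compares cleanly against plain tuples
        raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
except ValueError as err:
    print(err)  # Pointer shape torch.Size([2, 3]) and array shape (3, 2) mismatched
```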
@@ -425,7 +420,8 @@ class MegatronBertLayer(nn.Module):
         self.is_decoder = config.is_decoder
         self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
-            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
+            if not self.is_decoder:
+                raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
             self.crossattention = MegatronBertAttention(config)
         self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.intermediate = MegatronBertIntermediate(config)
@@ -461,9 +457,10 @@ class MegatronBertLayer(nn.Module):
         cross_attn_present_key_value = None
         if self.is_decoder and encoder_hidden_states is not None:
-            assert hasattr(
-                self, "crossattention"
-            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
+            if not hasattr(self, "crossattention"):
+                raise AttributeError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
+                )
             # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
             cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
@@ -1354,7 +1351,8 @@ class MegatronBertForMaskedLM(MegatronBertPreTrainedModel):
         effective_batch_size = input_shape[0]

         # add a dummy token
-        assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
+        if self.config.pad_token_id is None:
+            raise ValueError("The PAD token should be defined for generation")
         attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
         dummy_token = torch.full(
             (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
@@ -137,9 +137,10 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
     pos_embeddings = embeddings["position_embeddings"]["weight"]
     # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
     n_positions = pos_embeddings.size(0)
-    assert (
-        n_positions == config.n_positions
-    ), f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
+    if n_positions != config.n_positions:
+        raise ValueError(
+            f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match"
+        )
     # Store the position embeddings.
     output_state_dict["transformer.wpe.weight"] = pos_embeddings
@@ -773,9 +773,10 @@ class PegasusEncoder(PegasusPreTrainedModel):
         # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
-            assert head_mask.size()[0] == (
-                len(self.layers)
-            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -1040,9 +1041,10 @@ class PegasusDecoder(PegasusPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {attn_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
@@ -117,9 +117,10 @@ class PegasusTokenizer(PreTrainedTokenizer):
     ) -> None:
         self.offset = offset
         if additional_special_tokens is not None:
-            assert isinstance(
-                additional_special_tokens, list
-            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+            if not isinstance(additional_special_tokens, list):
+                raise TypeError(
+                    f"additional_special_tokens should be of type list, but is {type(additional_special_tokens)}"
+                )
             additional_special_tokens_extended = (
                 ([mask_token_sent] + additional_special_tokens)
@@ -113,9 +113,10 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
         self.offset = offset
         if additional_special_tokens is not None:
-            assert isinstance(
-                additional_special_tokens, list
-            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+            if not isinstance(additional_special_tokens, list):
+                raise TypeError(
+                    f"additional_special_tokens should be of type list, but is {type(additional_special_tokens)}"
+                )
             additional_special_tokens_extended = (
                 ([mask_token_sent] + additional_special_tokens)
@@ -155,9 +156,10 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
         all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
         all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        assert all_special_ids == set(
-            range(len(self.additional_special_tokens) + 3)
-        ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+        if all_special_ids != set(range(len(self.additional_special_tokens) + 3)):
+            raise ValueError(
+                f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+            )
         return [1 if x in all_special_ids else 0 for x in seq]
@@ -75,7 +75,8 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
     """
     prev_output_tokens = input_ids.clone()

-    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
     # replace possible -100 values in labels by `pad_token_id`
     prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
@@ -886,9 +886,10 @@ class UniSpeechGumbelVectorQuantizer(nn.Module):
         self.num_groups = config.num_codevector_groups
         self.num_vars = config.num_codevectors_per_group

-        assert (
-            config.codevector_dim % self.num_groups == 0
-        ), f"`config.codevector_dim {config.codevector_dim} must be divisible by `config.num_codevector_groups` {self.num_groups} for concatenation"
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim` {config.codevector_dim} must be divisible by `config.num_codevector_groups` {self.num_groups} for concatenation"
+            )

         # storage for codebook variables (codewords)
         self.codevectors = nn.Parameter(
@@ -70,9 +70,10 @@ def set_recursively(hf_pointer, key, value, full_name, weight_type):
     else:
         hf_shape = hf_pointer.shape

-    assert (
-        hf_shape == value.shape
-    ), f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}"
+    if hf_shape != value.shape:
+        raise ValueError(
+            f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be {value.shape} for {full_name}"
+        )

     if weight_type == "weight":
         hf_pointer.weight.data = value
@@ -143,28 +144,32 @@ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
     if type_id == 0:
         if "bias" in name:
-            assert (
-                value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape
-            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
+            if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
+                raise ValueError(
+                    f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
+                )
             feature_extractor.conv_layers[layer_id].conv.bias.data = value
             logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
         elif "weight" in name:
-            assert (
-                value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape
-            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
+            if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
+                raise ValueError(
+                    f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
+                )
             feature_extractor.conv_layers[layer_id].conv.weight.data = value
             logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
     elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
         if "bias" in name:
-            assert (
-                value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape
-            ), f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was found."
+            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
+                raise ValueError(
+                    f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
+                )
             feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
             logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
         elif "weight" in name:
-            assert (
-                value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape
-            ), f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
+            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
+                raise ValueError(
+                    f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
+                )
             feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
             logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
     else:
@@ -925,9 +925,10 @@ class UniSpeechSatGumbelVectorQuantizer(nn.Module):
         self.num_groups = config.num_codevector_groups
         self.num_vars = config.num_codevectors_per_group

-        assert (
-            config.codevector_dim % self.num_groups == 0
-        ), f"`config.codevector_dim {config.codevector_dim} must be divisible by `config.num_codevector_groups` {self.num_groups} for concatenation"
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim` {config.codevector_dim} must be divisible by `config.num_codevector_groups` {self.num_groups} for concatenation"
+            )

         # storage for codebook variables (codewords)
         self.codevectors = nn.Parameter(
@@ -139,7 +139,8 @@ def convert_weight_and_push(
     module_transfer(x)
     our_model = copy_parameters(from_model, our_model)

-    assert torch.allclose(from_model(x), our_model(x).logits), "The model logits don't match the original one."
+    if not torch.allclose(from_model(x), our_model(x).logits):
+        raise ValueError("The model logits don't match the original one.")
     checkpoint_name = name
     print(checkpoint_name)
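
For reference, `torch.allclose` compares tensors within floating-point tolerances (default `rtol=1e-05`, `atol=1e-08`), so the conversion check accepts tiny numerical drift; the tensors below are invented:

```python
import torch

a = torch.tensor([1.0, 2.0])
print(torch.allclose(a, a + 1e-9))  # True: within default tolerances
print(torch.allclose(a, a + 1e-3))  # False: drift this large now raises ValueError in the script
```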
@@ -720,9 +720,10 @@ class XGLMModel(XGLMPreTrainedModel):
         # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
             if attn_mask is not None:
-                assert attn_mask.size()[0] == (
-                    len(self.layers)
-                ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {attn_mask.size()[0]}."
+                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
@@ -263,7 +263,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         self.n_layers = config.n_layers
         self.max_position_embeddings = config.max_position_embeddings
         self.embed_init_std = config.embed_init_std
-        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
+        if self.dim % self.n_heads != 0:
+            raise ValueError("transformer dim must be a multiple of n_heads")

         # embeddings
         self.dropout = tf.keras.layers.Dropout(config.dropout)
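
Finally, the behavior motivating the whole change, sketched as a tiny script (hypothetical file contents, not from the diff): an `assert` is compiled away under `python -O`, while a `raise` always runs.

```python
# demo.py (hypothetical)
dim, n_heads = 512, 7

# assert dim % n_heads == 0, "transformer dim must be a multiple of n_heads"
# ^ under `python -O demo.py` this line would be skipped and the bad config accepted

if dim % n_heads != 0:
    raise ValueError("transformer dim must be a multiple of n_heads")  # raised with or without -O
```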