"vllm/model_executor/models/mistral3.py" did not exist on "a115ac46b5be22289dec975c2c06653b22cd6315"
Unverified Commit 5b5b48b1 authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Remove "Copied from" comments for deprecated models (#31153)

* Remove "Copied from" comments for deprecated models

* Remove them automatically in the script
parent 97e5a707
......@@ -78,7 +78,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
......@@ -110,7 +109,6 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
return (oh, ow)
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
......@@ -139,7 +137,6 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
......@@ -175,7 +172,6 @@ def get_image_size_for_max_height_width(
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
......@@ -200,7 +196,6 @@ def get_numpy_to_framework_fn(arr) -> Callable:
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
......@@ -214,7 +209,6 @@ def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
return arr
# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
......@@ -229,7 +223,6 @@ def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
return norm_annotation
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
......@@ -237,7 +230,6 @@ def max_across_indices(values: Iterable[Any]) -> List[Any]:
return [max(values_i) for values_i in zip(*values)]
# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
......@@ -256,7 +248,6 @@ def get_max_height_width(
return (max_height, max_width)
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
......@@ -275,7 +266,6 @@ def make_pixel_mask(
return mask
# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
......@@ -310,7 +300,6 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar
return masks
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DETA
def prepare_coco_detection_annotation(
image,
target,
......@@ -371,7 +360,6 @@ def prepare_coco_detection_annotation(
return new_target
# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
......@@ -406,7 +394,6 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
return np.stack([x_min, y_min, x_max, y_max], 1)
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DETA
def prepare_coco_panoptic_annotation(
image: np.ndarray,
target: Dict,
......@@ -448,7 +435,6 @@ def prepare_coco_panoptic_annotation(
return new_target
# Copied from transformers.models.detr.image_processing_detr.resize_annotation
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
......@@ -594,7 +580,6 @@ class DetaImageProcessor(BaseImageProcessor):
self.do_pad = do_pad
self.pad_size = pad_size
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
def prepare_annotation(
self,
image: np.ndarray,
......@@ -683,7 +668,6 @@ class DetaImageProcessor(BaseImageProcessor):
)
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
def resize_annotation(
self,
annotation,
......@@ -697,7 +681,6 @@ class DetaImageProcessor(BaseImageProcessor):
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
def rescale(
self,
image: np.ndarray,
......@@ -726,7 +709,6 @@ class DetaImageProcessor(BaseImageProcessor):
"""
return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
......@@ -734,7 +716,6 @@ class DetaImageProcessor(BaseImageProcessor):
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
......@@ -778,7 +759,6 @@ class DetaImageProcessor(BaseImageProcessor):
new_annotation[key] = value
return new_annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
......@@ -812,7 +792,6 @@ class DetaImageProcessor(BaseImageProcessor):
)
return padded_image, annotation
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
......
......@@ -52,7 +52,6 @@ logger = logging.get_logger(__name__)
MultiScaleDeformableAttention = None
# Copied from models.deformable_detr.load_cuda_kernels
def load_cuda_kernels():
from torch.utils.cpp_extension import load
......@@ -83,7 +82,6 @@ def load_cuda_kernels():
)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction
class MultiScaleDeformableAttentionFunction(Function):
@staticmethod
def forward(
......@@ -152,7 +150,6 @@ _CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
@dataclass
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->Deta
class DetaDecoderOutput(ModelOutput):
"""
Base class for outputs of the DetaDecoder. This class adds two attributes to
......@@ -344,7 +341,6 @@ def inverse_sigmoid(x, eps=1e-5):
return torch.log(x1 / x2)
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->Deta
class DetaFrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
......@@ -384,7 +380,6 @@ class DetaFrozenBatchNorm2d(nn.Module):
return x * scale + bias
# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->Deta
def replace_batch_norm(model):
r"""
Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.
......@@ -454,7 +449,6 @@ class DetaBackboneWithPositionalEncodings(nn.Module):
return out, pos
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->Deta
class DetaSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
......@@ -493,7 +487,6 @@ class DetaSinePositionEmbedding(nn.Module):
return pos
# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
class DetaLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
......@@ -517,7 +510,6 @@ class DetaLearnedPositionEmbedding(nn.Module):
return pos
# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->Deta
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
......@@ -531,7 +523,6 @@ def build_position_encoding(config):
return position_embedding
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
......@@ -571,7 +562,6 @@ def multi_scale_deformable_attention(
return output.transpose(1, 2).contiguous()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->Deta
class DetaMultiscaleDeformableAttention(nn.Module):
"""
Multiscale deformable attention as proposed in Deformable DETR.
......@@ -715,7 +705,6 @@ class DetaMultiscaleDeformableAttention(nn.Module):
return output, attention_weights
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->Deta,Deformable DETR->DETA
class DetaMultiheadAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper.
......@@ -1506,11 +1495,9 @@ class DetaModel(DetaPreTrainedModel):
self.post_init()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_encoder
# Accessor: returns whatever object is stored on `self.encoder`.
def get_encoder(self):
return self.encoder
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_decoder
# Accessor: returns whatever object is stored on `self.decoder`.
def get_decoder(self):
return self.decoder
......@@ -1522,7 +1509,6 @@ class DetaModel(DetaPreTrainedModel):
for name, param in self.backbone.model.named_parameters():
param.requires_grad_(True)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_valid_ratio
def get_valid_ratio(self, mask, dtype=torch.float32):
"""Get the valid ratio of all feature maps."""
......@@ -1534,7 +1520,6 @@ class DetaModel(DetaPreTrainedModel):
valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
return valid_ratio
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_proposal_pos_embed
def get_proposal_pos_embed(self, proposals):
"""Get the position embedding of the proposals."""
......@@ -1869,7 +1854,6 @@ class DetaForObjectDetection(DetaPreTrainedModel):
# We can't initialize the model on meta device as some weights are modified during the initialization
_no_split_modules = None
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
def __init__(self, config: DetaConfig):
super().__init__(config)
......@@ -2105,7 +2089,6 @@ class DetaForObjectDetection(DetaPreTrainedModel):
return dict_outputs
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
......@@ -2125,7 +2108,6 @@ def dice_loss(inputs, targets, num_boxes):
return loss.sum() / num_boxes
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
......@@ -2197,7 +2179,6 @@ class DetaLoss(nn.Module):
if self.assign_second_stage:
self.stg2_assigner = DetaStage2Assigner(num_queries)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
......@@ -2232,7 +2213,6 @@ class DetaLoss(nn.Module):
return losses
@torch.no_grad()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
......@@ -2248,7 +2228,6 @@ class DetaLoss(nn.Module):
losses = {"cardinality_error": card_err}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
......@@ -2273,21 +2252,18 @@ class DetaLoss(nn.Module):
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
# Flattens per-pair source indices into two parallel index tensors.
# `indices` is iterated as (source, target) pairs; only `source` is used here.
# Presumably there is one pair per batch element -- TODO confirm against the caller.
# Returns the tuple (batch_idx, source_idx), each concatenated over all pairs.
def _get_source_permutation_idx(self, indices):
# permute predictions following indices
# full_like(source, i) gives a tensor shaped like `source`, filled with the pair's position i
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
# the source index tensors themselves, concatenated in the same order as batch_idx
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
# Flattens per-pair target indices into two parallel index tensors.
# `indices` is iterated as (source, target) pairs; only `target` is used here.
# Presumably there is one pair per batch element -- TODO confirm against the caller.
# Returns the tuple (batch_idx, target_idx), each concatenated over all pairs.
def _get_target_permutation_idx(self, indices):
# permute targets following indices
# full_like(target, i) gives a tensor shaped like `target`, filled with the pair's position i
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
# the target index tensors themselves, concatenated in the same order as batch_idx
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.get_loss
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
......@@ -2360,7 +2336,6 @@ class DetaLoss(nn.Module):
return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DetaMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
......@@ -2382,7 +2357,6 @@ class DetaMLPPredictionHead(nn.Module):
return x
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->Deta
class DetaHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
......@@ -2463,7 +2437,6 @@ class DetaHungarianMatcher(nn.Module):
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
......@@ -2472,7 +2445,6 @@ def _upcast(t: Tensor) -> Tensor:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
......@@ -2489,7 +2461,6 @@ def box_area(boxes: Tensor) -> Tensor:
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
......@@ -2506,7 +2477,6 @@ def box_iou(boxes1, boxes2):
return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
......
......@@ -239,7 +239,6 @@ class EfficientFormerConvMlp(nn.Module):
return hidden_state
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
......@@ -260,7 +259,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
......
......@@ -86,7 +86,6 @@ class ErnieMEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ErnieM,self.value->self.v_proj,self.key->self.k_proj,self.query->self.q_proj
class ErnieMSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
......@@ -380,7 +379,6 @@ class ErnieMEncoder(nn.Module):
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ErnieM
class ErnieMPooler(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -599,7 +597,6 @@ class ErnieMModel(ErnieMPreTrainedModel):
ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -701,7 +698,6 @@ class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
......@@ -791,7 +787,6 @@ class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -872,7 +867,6 @@ class ErnieMForTokenClassification(ErnieMPreTrainedModel):
ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->ErnieM,bert->ernie_m
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
......@@ -968,7 +962,6 @@ class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
ERNIE_M_START_DOCSTRING,
)
# Copied from paddlenlp.transformers.ernie_m.modeling.UIEM
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
def __init__(self, config):
super(ErnieMForInformationExtraction, self).__init__(config)
......
......@@ -45,7 +45,6 @@ _CHECKPOINT_FOR_DOC = "Tanrei/GPTSAN-japanese"
####################################################
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.router_z_loss_func
def router_z_loss_func(router_logits: torch.Tensor) -> float:
r"""
Compute the router z-loss implemented in PyTorch.
......@@ -66,7 +65,6 @@ def router_z_loss_func(router_logits: torch.Tensor) -> float:
return torch.sum(z_loss) / (num_groups * tokens_per_group)
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.load_balancing_loss_func
def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
r"""
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
......@@ -140,7 +138,6 @@ class GPTSanJapaneseDenseActDense(nn.Module):
return hidden_states
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersTop1Router with SwitchTransformers->GPTSanJapanese
class GPTSanJapaneseTop1Router(nn.Module):
"""
Router using tokens choose top-1 experts assignment.
......@@ -234,7 +231,6 @@ class GPTSanJapaneseTop1Router(nn.Module):
return expert_index, router_probs, router_logits
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersSparseMLP with SwitchTransformers->GPTSanJapanese
class GPTSanJapaneseSparseMLP(nn.Module):
r"""
Implementation of the Switch Transformers Sparse MLP module.
......@@ -345,7 +341,6 @@ class GPTSanJapaneseLayerDenseFF(nn.Module):
return output
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->GPTSanJapanese
class GPTSanJapaneseAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
......@@ -749,7 +744,6 @@ class GPTSanJapanesePreTrainedModel(PreTrainedModel):
module.experts[f"expert_{idx}"].wi.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
module.experts[f"expert_{idx}"].wo.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
# Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
......@@ -1298,17 +1292,14 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
"past_key_values": None,
}
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.prepare_decoder_input_ids_from_labels with SwitchTransformers->GPTSanJapanese
# Derives decoder input ids from `labels` by delegating to `self._shift_right`
# and returning its result unchanged.
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return self._shift_right(labels)
# Copied from transformers.models.mbart.modeling_mbart.MBartForConditionalGeneration.resize_token_embeddings with MBart->GPTSanJapanese
# Resizes the token embeddings through the parent class, then resizes the final
# logits bias to match, and returns the embedding module the parent produced.
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
# The bias is sized from the resulting weight's first dimension rather than from
# `new_num_tokens`; presumably the two can differ when `pad_to_multiple_of` is set -- TODO confirm.
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
# Copied from transformers.models.mbart.modeling_mbart.MBartForConditionalGeneration._resize_final_logits_bias with MBart->GPTSanJapanese
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
......@@ -1324,15 +1315,12 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
# Forwards `new_embeddings` to `self.model.set_input_embeddings`; nothing is returned.
def set_input_embeddings(self, new_embeddings):
self.model.set_input_embeddings(new_embeddings)
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.set_output_embeddings with SwitchTransformers->GPTSanJapanese
# Replaces the output embedding layer by rebinding `self.lm_head` to `new_embeddings`.
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.get_output_embeddings with SwitchTransformers->GPTSanJapanese
# Accessor for the output embedding layer: returns `self.lm_head`.
def get_output_embeddings(self):
return self.lm_head
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration._unpack_router_logits with SwitchTransformers->GPTSanJapanese
def _unpack_router_logits(self, router_outputs):
total_router_logits = []
total_expert_indexes = []
......
......@@ -179,25 +179,20 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
)
@property
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
# Reports the vocabulary size as the number of entries in `self.raw_vocab`.
def vocab_size(self):
# self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
# (so the size is measured on self.raw_vocab, not on self.vocab)
return len(self.raw_vocab)
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.get_vocab
# Returns a new dict built from `self.raw_vocab` with the entries of
# `self.added_tokens_encoder` layered on top.
def get_vocab(self):
# dict(mapping, **extra): on a key collision the added-token entry wins, and the
# keys of `self.added_tokens_encoder` must be strings for the ** unpacking to work.
return dict(self.raw_vocab, **self.added_tokens_encoder)
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._tokenize
# Tokenizes `text` by delegating to `self.subword_tokenizer.tokenize`, passing
# `self.do_clean_text` as its `clean` flag, and returns that call's result.
def _tokenize(self, text):
return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# A token missing from `self.vocab` falls back to the entry for `self.unk_token`.
# Assuming `self.vocab` is a plain dict, the result is None when that entry is missing too.
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# The lookup itself is delegated to `self.subword_tokenizer.convert_id_to_token`.
return self.subword_tokenizer.convert_id_to_token(index)
......@@ -254,7 +249,6 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
"{% endfor %}"
)
# Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
......@@ -412,7 +406,6 @@ class SubWordJapaneseTokenizer(object):
SOFTWARE.
"""
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__init__
def __init__(self, vocab, ids_to_tokens, emoji):
self.vocab = vocab # same as swe
self.ids_to_tokens = ids_to_tokens # same as bpe
......@@ -434,11 +427,9 @@ class SubWordJapaneseTokenizer(object):
blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__len__
# Length protocol: the tokenizer's length is the number of entries in `self.ids_to_tokens`.
def __len__(self):
return len(self.ids_to_tokens)
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.clean_text
def clean_text(self, content):
content = self.content_repatter1.sub("<URL>", content)
content = self.content_repatter2.sub("<EMAIL>", content)
......@@ -451,7 +442,6 @@ class SubWordJapaneseTokenizer(object):
content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
return content
# Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.tokenize
def tokenize(self, text, clean=False):
text = text.replace(" ", "<SP>")
text = text.replace(" ", "<SP>")
......
......@@ -256,7 +256,6 @@ class NatDownsampler(nn.Module):
return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
......@@ -277,7 +276,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat
class NatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
......
......@@ -346,7 +346,6 @@ class NezhaSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Nezha
class NezhaSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -410,7 +409,6 @@ class NezhaAttention(nn.Module):
return outputs
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Nezha
class NezhaIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -426,7 +424,6 @@ class NezhaIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Nezha
class NezhaOutput(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -527,7 +524,6 @@ class NezhaLayer(nn.Module):
return layer_output
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Nezha
class NezhaEncoder(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -621,7 +617,6 @@ class NezhaEncoder(nn.Module):
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Nezha
class NezhaPooler(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -637,7 +632,6 @@ class NezhaPooler(nn.Module):
return pooled_output
# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Nezha
class NezhaPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -655,7 +649,6 @@ class NezhaPredictionHeadTransform(nn.Module):
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Nezha
class NezhaLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -679,7 +672,6 @@ class NezhaLMPredictionHead(nn.Module):
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Nezha
class NezhaOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -690,7 +682,6 @@ class NezhaOnlyMLMHead(nn.Module):
return prediction_scores
# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Nezha
class NezhaOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -701,7 +692,6 @@ class NezhaOnlyNSPHead(nn.Module):
return seq_relationship_score
# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Nezha
class NezhaPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
......
......@@ -145,7 +145,6 @@ class OpenLlamaConfig(PretrainedConfig):
**kwargs,
)
# Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
......
......@@ -46,7 +46,6 @@ except ImportError:
_CONFIG_FOR_DOC = "OpenLlamaConfig"
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->OpenLlama
class OpenLlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
......@@ -64,7 +63,6 @@ class OpenLlamaRMSNorm(nn.Module):
return self.weight * hidden_states.to(input_dtype)
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->OpenLlama
class OpenLlamaRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
......@@ -101,7 +99,6 @@ class OpenLlamaRotaryEmbedding(nn.Module):
)
# Copied from transformers.models.falcon.modeling_falcon.FalconLinearScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
"""OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
......@@ -121,7 +118,6 @@ class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
# Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
"""OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
......@@ -155,7 +151,6 @@ def rotate_half(x):
return torch.cat((-x2, x1), dim=-1)
# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
......@@ -228,7 +223,6 @@ class OpenLlamaAttention(nn.Module):
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self._init_rope()
# Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->OpenLlama
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = OpenLlamaRotaryEmbedding(
......
......@@ -142,7 +142,6 @@ def load_tf_weights_in_qdqbert(model, tf_checkpoint_path):
return model
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert -> QDQBert
class QDQBertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
......@@ -628,7 +627,6 @@ class QDQBertEncoder(nn.Module):
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert -> QDQBert
class QDQBertPooler(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -644,7 +642,6 @@ class QDQBertPooler(nn.Module):
return pooled_output
# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert -> QDQBert
class QDQBertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -697,7 +694,6 @@ class QDQBertOnlyMLMHead(nn.Module):
return prediction_scores
# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert -> QDQBert
class QDQBertOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
......
......@@ -150,7 +150,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
return model
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->Realm
class RealmEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
......@@ -215,7 +214,6 @@ class RealmEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Realm
class RealmSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
......@@ -350,7 +348,6 @@ class RealmSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Realm
class RealmSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -370,7 +367,6 @@ REALM_SELF_ATTENTION_CLASSES = {
}
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm,BERT->REALM
class RealmAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
......@@ -422,7 +418,6 @@ class RealmAttention(nn.Module):
return outputs
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Realm
class RealmIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -438,7 +433,6 @@ class RealmIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Realm
class RealmOutput(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -453,7 +447,6 @@ class RealmOutput(nn.Module):
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Realm
class RealmLayer(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -540,7 +533,6 @@ class RealmLayer(nn.Module):
return layer_output
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Realm
class RealmEncoder(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -634,7 +626,6 @@ class RealmEncoder(nn.Module):
)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Realm
class RealmPooler(nn.Module):
def __init__(self, config):
super().__init__()
......
......@@ -28,7 +28,6 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
......@@ -40,7 +39,6 @@ def load_vocab(vocab_file):
return vocab
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
......@@ -96,7 +94,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.__init__
def __init__(
self,
vocab_file,
......@@ -145,20 +142,16 @@ class RetriBertTokenizer(PreTrainedTokenizer):
)
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
def vocab_size(self):
return len(self.vocab)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
......@@ -174,23 +167,19 @@ class RetriBertTokenizer(PreTrainedTokenizer):
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
......@@ -216,7 +205,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
......@@ -245,7 +233,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
......@@ -275,7 +262,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
......@@ -297,7 +283,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
......@@ -459,7 +444,6 @@ class BasicTokenizer(object):
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
......
......@@ -76,7 +76,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
slow_tokenizer_class = RetriBertTokenizer
model_input_names = ["input_ids", "attention_mask"]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.__init__
def __init__(
self,
vocab_file=None,
......@@ -119,7 +118,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
self.do_lower_case = do_lower_case
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
......@@ -144,7 +142,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
return output
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
......@@ -174,7 +171,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
......@@ -36,7 +36,6 @@ _CONFIG_FOR_DOC = "Speech2Text2Config"
_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextSinusoidalPositionalEmbedding with Speech2Text->Speech2Text2
class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
......@@ -107,7 +106,6 @@ class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
return incremental_indices.long() + padding_idx
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Speech2Text2
class Speech2Text2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
......
......@@ -340,7 +340,6 @@ class TvltAudioPatchEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfAttention with Vilt->Tvlt
class TvltSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -401,7 +400,6 @@ class TvltSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfOutput with Vilt->Tvlt
class TvltSelfOutput(nn.Module):
"""
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
......@@ -420,7 +418,6 @@ class TvltSelfOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltAttention with Vilt->Tvlt
class TvltAttention(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -455,7 +452,6 @@ class TvltAttention(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltIntermediate with Vilt->Tvlt
class TvltIntermediate(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
......@@ -472,7 +468,6 @@ class TvltIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltOutput with Vilt->Tvlt
class TvltOutput(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
......@@ -488,7 +483,6 @@ class TvltOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltLayer with Vilt->Tvlt
class TvltLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
......@@ -527,7 +521,6 @@ class TvltLayer(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltEncoder with Vilt->Tvlt
class TvltEncoder(nn.Module):
def __init__(self, config):
super().__init__()
......
......@@ -48,7 +48,6 @@ _IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
......@@ -69,7 +68,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
return output
# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Van
class VanDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
......
......@@ -140,7 +140,6 @@ class ViTHybridImageProcessor(BaseImageProcessor):
"input_data_format",
]
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
self,
image: np.ndarray,
......
......@@ -51,7 +51,6 @@ class ViTHybridEmbeddings(nn.Module):
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
"""
# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.__init__ with ViT->ViTHybrid
def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None:
super().__init__()
......@@ -186,7 +185,6 @@ class ViTHybridPatchEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTHybrid
class ViTHybridSelfAttention(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
......@@ -247,7 +245,6 @@ class ViTHybridSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTSdpaSelfAttention with ViT->ViTHybrid
class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
......@@ -279,7 +276,6 @@ class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
return context_layer, None
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTHybrid
class ViTHybridSelfOutput(nn.Module):
"""
The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the
......@@ -298,7 +294,6 @@ class ViTHybridSelfOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTHybrid
class ViTHybridAttention(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
......@@ -338,14 +333,12 @@ class ViTHybridAttention(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->ViTHybrid
class ViTHybridSdpaAttention(ViTHybridAttention):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
self.attention = ViTHybridSdpaSelfAttention(config)
# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTHybrid
class ViTHybridIntermediate(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
......@@ -362,7 +355,6 @@ class ViTHybridIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTHybrid
class ViTHybridOutput(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
......@@ -427,7 +419,6 @@ class ViTHybridLayer(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTHybrid
class ViTHybridEncoder(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
......@@ -479,7 +470,6 @@ class ViTHybridEncoder(nn.Module):
)
# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->ViTHybrid
class ViTHybridPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
......@@ -558,7 +548,6 @@ VIT_INPUTS_DOCSTRING = r"""
"The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.",
VIT_START_DOCSTRING,
)
# Copied from transformers.models.vit.modeling_vit.ViTModel with ViT->ViTHybrid
class ViTHybridModel(ViTHybridPreTrainedModel):
def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
super().__init__(config)
......@@ -654,7 +643,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):
)
# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->ViTHybrid
class ViTHybridPooler(nn.Module):
def __init__(self, config: ViTHybridConfig):
super().__init__()
......@@ -677,7 +665,6 @@ class ViTHybridPooler(nn.Module):
""",
VIT_START_DOCSTRING,
)
# Copied from transformers.models.vit.modeling_vit.ViTForImageClassification with ViT->ViTHybrid
class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
......
......@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "XLMProphetNetConfig"
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_START_DOCSTRING with ProphetNetConfig->XLMProphetNetConfig
XLM_PROPHETNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
......@@ -64,7 +63,6 @@ XLM_PROPHETNET_START_DOCSTRING = r"""
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_INPUTS_DOCSTRING with ProphetNet->XLMProphetNet
XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
......@@ -139,7 +137,6 @@ XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
"""
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_STANDALONE_INPUTS_DOCSTRING with ProphetNet->XLMProphetNet
XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
......@@ -174,7 +171,6 @@ XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.prophetnet.modeling_prophetnet.softmax
def softmax(hidden_state, dim, onnx_trace=False):
if onnx_trace:
return nn.functional.softmax(hidden_state.float(), dim=dim)
......@@ -182,7 +178,6 @@ def softmax(hidden_state, dim, onnx_trace=False):
return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ngram_attention_bias
def ngram_attention_bias(sequence_length, ngram, device, dtype):
"""
This function computes the bias for the predict stream
......@@ -200,7 +195,6 @@ def ngram_attention_bias(sequence_length, ngram, device, dtype):
return torch.cat([left_block, right_block], dim=2)
# Copied from transformers.models.prophetnet.modeling_prophetnet.compute_relative_buckets
def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
"""
This function computes individual parts of the relative position buckets. For more detail, see paper.
......@@ -228,7 +222,6 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
return rel_positions_bucket
# Copied from transformers.models.prophetnet.modeling_prophetnet.compute_all_stream_relative_buckets
def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
"""
This function computes both main and predict relative position buckets. For more detail, see paper.
......@@ -253,7 +246,6 @@ def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids)
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
......@@ -339,7 +331,6 @@ class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
......@@ -426,7 +417,6 @@ class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetDecoderModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
......@@ -487,7 +477,6 @@ class XLMProphetNetDecoderModelOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetDecoderLMOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
......@@ -549,7 +538,6 @@ class XLMProphetNetDecoderLMOutput(ModelOutput):
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPreTrainedModel with ProphetNet->XLMProphetNet
class XLMProphetNetPreTrainedModel(PreTrainedModel):
config_class = XLMProphetNetConfig
base_model_prefix = "prophetnet"
......@@ -588,7 +576,6 @@ class XLMProphetNetPreTrainedModel(PreTrainedModel):
return shifted_input_ids
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPositionalEmbeddings with ProphetNet->XLMProphetNet
class XLMProphetNetPositionalEmbeddings(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
......@@ -632,7 +619,6 @@ class XLMProphetNetPositionalEmbeddings(nn.Embedding):
return super().forward(position_ids)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetAttention with ProphetNet->XLMProphetNet
class XLMProphetNetAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
......@@ -762,7 +748,6 @@ class XLMProphetNetAttention(nn.Module):
return attn_output, attn_weights_reshaped, past_key_value
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetFeedForward with ProphetNet->XLMProphetNet
class XLMProphetNetFeedForward(nn.Module):
"""
This is the residual two feed-forward layer block based on the original Transformer implementation.
......@@ -786,7 +771,6 @@ class XLMProphetNetFeedForward(nn.Module):
return hidden_states
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetNgramSelfAttention with ProphetNet->XLMProphetNet
class XLMProphetNetNgramSelfAttention(nn.Module):
def __init__(self, config: XLMProphetNetConfig):
super().__init__()
......@@ -1106,7 +1090,6 @@ class XLMProphetNetNgramSelfAttention(nn.Module):
return predict_relative_pos_embeddings
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoderLayer with ProphetNet->XLMProphetNet, Prophetnet->XLMProphetnet
class XLMProphetNetEncoderLayer(nn.Module):
"""
Encoder block for XLMProphetnet
......@@ -1150,7 +1133,6 @@ class XLMProphetNetEncoderLayer(nn.Module):
return outputs
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLayer with Prophetnet->XLMProphetnet, ProphetNet->XLMProphetNet
class XLMProphetNetDecoderLayer(nn.Module):
"""
Decoder block for XLMProphetnet
......@@ -1239,7 +1221,6 @@ class XLMProphetNetDecoderLayer(nn.Module):
"The standalone encoder part of the XLMProphetNetModel.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoder with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
r"""
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
......@@ -1374,7 +1355,6 @@ class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
"The standalone decoder part of the XLMProphetNetModel.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoder with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET,
class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
r"""
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
......@@ -1743,7 +1723,6 @@ class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
"The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetModel with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
......@@ -1878,7 +1857,6 @@ class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
"The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
......@@ -2073,7 +2051,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
return self._shift_right(labels)
@staticmethod
# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration._reorder_cache
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
......@@ -2096,7 +2073,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
" language modeling.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForCausalLM with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
......@@ -2329,7 +2305,6 @@ class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
}
@staticmethod
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM._reorder_cache
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
......@@ -2339,7 +2314,6 @@ class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
return reordered_past
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderWrapper with ProphetNet->XLMProphetNet, prophetnet->XLMProphetNet
class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel):
"""
This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment