Unverified commit 5b5b48b1, authored by amyeroberts and committed by GitHub

Remove copied froms for deprecated models (#31153)

* Remove copied froms for deprecated models

* Remove automatically in script
parent 97e5a707
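The second commit message line notes that the comments were stripped automatically by a script rather than by hand. That script is not part of this diff; the following is only a minimal illustrative sketch of such a clean-up pass, where the model list (read off the files touched in this diff), the directory layout, and the helper name are assumptions rather than the actual tool that was run:

# Hypothetical sketch only -- not the script used in this commit.
# Assumes the deprecated model folders live under src/transformers/models/<model>/.
import re
from pathlib import Path

# Assumed list of deprecated models, inferred from the files touched in this diff.
DEPRECATED_MODELS = [
    "deta", "efficientformer", "ernie_m", "gptsan_japanese", "nat",
    "nezha", "open_llama", "qdqbert", "realm", "retribert",
]

# Matches whole lines such as "# Copied from transformers.models.detr...."
COPIED_FROM_RE = re.compile(r"^[ \t]*#\s*Copied from .*\n", re.MULTILINE)

def strip_copied_from(root: str = "src/transformers/models") -> None:
    for model in DEPRECATED_MODELS:
        for path in Path(root, model).glob("*.py"):
            text = path.read_text()
            cleaned = COPIED_FROM_RE.sub("", text)
            if cleaned != text:
                path.write_text(cleaned)

if __name__ == "__main__":
    strip_copied_from()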
@@ -78,7 +78,6 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
    """
    Computes the output image size given the input image size and the desired output size.
@@ -110,7 +109,6 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
    return (oh, ow)
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int]],
@@ -139,7 +137,6 @@ def get_resize_output_image_size(
    return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
    input_image: np.ndarray,
    max_height: int,
@@ -175,7 +172,6 @@ def get_image_size_for_max_height_width(
    return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
    """
    Returns a function that converts a numpy array to the framework of the input array.
@@ -200,7 +196,6 @@ def get_numpy_to_framework_fn(arr) -> Callable:
    raise ValueError(f"Cannot convert arrays of type {type(arr)}")
# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
    """
    Squeezes an array, but only if the axis specified has dim 1.
@@ -214,7 +209,6 @@ def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
    return arr
# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    image_height, image_width = image_size
    norm_annotation = {}
@@ -229,7 +223,6 @@ def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
    return norm_annotation
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the maximum value across all indices of an iterable of values.
@@ -237,7 +230,6 @@ def max_across_indices(values: Iterable[Any]) -> List[Any]:
    return [max(values_i) for values_i in zip(*values)]
# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
@@ -256,7 +248,6 @@ def get_max_height_width(
    return (max_height, max_width)
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
@@ -275,7 +266,6 @@ def make_pixel_mask(
    return mask
# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
    """
    Convert a COCO polygon annotation to a mask.
@@ -310,7 +300,6 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndar
    return masks
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DETA
def prepare_coco_detection_annotation(
    image,
    target,
@@ -371,7 +360,6 @@ def prepare_coco_detection_annotation(
    return new_target
# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    """
    Compute the bounding boxes around the provided panoptic segmentation masks.
@@ -406,7 +394,6 @@ def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
    return np.stack([x_min, y_min, x_max, y_max], 1)
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DETA
def prepare_coco_panoptic_annotation(
    image: np.ndarray,
    target: Dict,
@@ -448,7 +435,6 @@ def prepare_coco_panoptic_annotation(
    return new_target
# Copied from transformers.models.detr.image_processing_detr.resize_annotation
def resize_annotation(
    annotation: Dict[str, Any],
    orig_size: Tuple[int, int],
@@ -594,7 +580,6 @@ class DetaImageProcessor(BaseImageProcessor):
        self.do_pad = do_pad
        self.pad_size = pad_size
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
    def prepare_annotation(
        self,
        image: np.ndarray,
@@ -683,7 +668,6 @@ class DetaImageProcessor(BaseImageProcessor):
        )
        return image
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
    def resize_annotation(
        self,
        annotation,
@@ -697,7 +681,6 @@ class DetaImageProcessor(BaseImageProcessor):
        """
        return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
        self,
        image: np.ndarray,
@@ -726,7 +709,6 @@ class DetaImageProcessor(BaseImageProcessor):
        """
        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
    def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
        """
        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
@@ -734,7 +716,6 @@ class DetaImageProcessor(BaseImageProcessor):
        """
        return normalize_annotation(annotation, image_size=image_size)
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image
    def _update_annotation_for_padded_image(
        self,
        annotation: Dict,
@@ -778,7 +759,6 @@ class DetaImageProcessor(BaseImageProcessor):
                new_annotation[key] = value
        return new_annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
@@ -812,7 +792,6 @@ class DetaImageProcessor(BaseImageProcessor):
            )
        return padded_image, annotation
    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
...
@@ -52,7 +52,6 @@ logger = logging.get_logger(__name__)
MultiScaleDeformableAttention = None
# Copied from models.deformable_detr.load_cuda_kernels
def load_cuda_kernels():
    from torch.utils.cpp_extension import load
@@ -83,7 +82,6 @@ def load_cuda_kernels():
    )
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction
class MultiScaleDeformableAttentionFunction(Function):
    @staticmethod
    def forward(
@@ -152,7 +150,6 @@ _CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
@dataclass
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->Deta
class DetaDecoderOutput(ModelOutput):
    """
    Base class for outputs of the DetaDecoder. This class adds two attributes to
@@ -344,7 +341,6 @@ def inverse_sigmoid(x, eps=1e-5):
    return torch.log(x1 / x2)
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->Deta
class DetaFrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
@@ -384,7 +380,6 @@ class DetaFrozenBatchNorm2d(nn.Module):
        return x * scale + bias
# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->Deta
def replace_batch_norm(model):
    r"""
    Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.
@@ -454,7 +449,6 @@ class DetaBackboneWithPositionalEncodings(nn.Module):
        return out, pos
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->Deta
class DetaSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
@@ -493,7 +487,6 @@ class DetaSinePositionEmbedding(nn.Module):
        return pos
# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
class DetaLearnedPositionEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
@@ -517,7 +510,6 @@ class DetaLearnedPositionEmbedding(nn.Module):
        return pos
# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->Deta
def build_position_encoding(config):
    n_steps = config.d_model // 2
    if config.position_embedding_type == "sine":
@@ -531,7 +523,6 @@ def build_position_encoding(config):
    return position_embedding
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
@@ -571,7 +562,6 @@ def multi_scale_deformable_attention(
    return output.transpose(1, 2).contiguous()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->Deta
class DetaMultiscaleDeformableAttention(nn.Module):
    """
    Multiscale deformable attention as proposed in Deformable DETR.
@@ -715,7 +705,6 @@ class DetaMultiscaleDeformableAttention(nn.Module):
        return output, attention_weights
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->Deta,Deformable DETR->DETA
class DetaMultiheadAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.
@@ -1506,11 +1495,9 @@ class DetaModel(DetaPreTrainedModel):
        self.post_init()
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_encoder
    def get_encoder(self):
        return self.encoder
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_decoder
    def get_decoder(self):
        return self.decoder
@@ -1522,7 +1509,6 @@ class DetaModel(DetaPreTrainedModel):
        for name, param in self.backbone.model.named_parameters():
            param.requires_grad_(True)
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_valid_ratio
    def get_valid_ratio(self, mask, dtype=torch.float32):
        """Get the valid ratio of all feature maps."""
@@ -1534,7 +1520,6 @@ class DetaModel(DetaPreTrainedModel):
        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
        return valid_ratio
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_proposal_pos_embed
    def get_proposal_pos_embed(self, proposals):
        """Get the position embedding of the proposals."""
@@ -1869,7 +1854,6 @@ class DetaForObjectDetection(DetaPreTrainedModel):
    # We can't initialize the model on meta device as some weights are modified during the initialization
    _no_split_modules = None
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
    def __init__(self, config: DetaConfig):
        super().__init__(config)
@@ -2105,7 +2089,6 @@ class DetaForObjectDetection(DetaPreTrainedModel):
        return dict_outputs
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
    """
    Compute the DICE loss, similar to generalized IOU for masks
@@ -2125,7 +2108,6 @@ def dice_loss(inputs, targets, num_boxes):
    return loss.sum() / num_boxes
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
@@ -2197,7 +2179,6 @@ class DetaLoss(nn.Module):
        if self.assign_second_stage:
            self.stg2_assigner = DetaStage2Assigner(num_queries)
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
    def loss_labels(self, outputs, targets, indices, num_boxes):
        """
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
@@ -2232,7 +2213,6 @@ class DetaLoss(nn.Module):
        return losses
    @torch.no_grad()
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """
        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
@@ -2248,7 +2228,6 @@ class DetaLoss(nn.Module):
        losses = {"cardinality_error": card_err}
        return losses
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """
        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
@@ -2273,21 +2252,18 @@ class DetaLoss(nn.Module):
        losses["loss_giou"] = loss_giou.sum() / num_boxes
        return losses
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
    def _get_source_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
        source_idx = torch.cat([source for (source, _) in indices])
        return batch_idx, source_idx
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
    def _get_target_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
        target_idx = torch.cat([target for (_, target) in indices])
        return batch_idx, target_idx
    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.get_loss
    def get_loss(self, loss, outputs, targets, indices, num_boxes):
        loss_map = {
            "labels": self.loss_labels,
@@ -2360,7 +2336,6 @@ class DetaLoss(nn.Module):
        return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DetaMLPPredictionHead(nn.Module):
    """
    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
@@ -2382,7 +2357,6 @@ class DetaMLPPredictionHead(nn.Module):
        return x
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->Deta
class DetaHungarianMatcher(nn.Module):
    """
    This class computes an assignment between the targets and the predictions of the network.
@@ -2463,7 +2437,6 @@ class DetaHungarianMatcher(nn.Module):
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
@@ -2472,7 +2445,6 @@ def _upcast(t: Tensor) -> Tensor:
    return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
@@ -2489,7 +2461,6 @@ def box_area(boxes: Tensor) -> Tensor:
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)
@@ -2506,7 +2477,6 @@ def box_iou(boxes1, boxes2):
    return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
...
@@ -239,7 +239,6 @@ class EfficientFormerConvMlp(nn.Module):
        return hidden_state
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
@@ -260,7 +259,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
    return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
...
@@ -86,7 +86,6 @@ class ErnieMEmbeddings(nn.Module):
        return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ErnieM,self.value->self.v_proj,self.key->self.k_proj,self.query->self.q_proj
class ErnieMSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
@@ -380,7 +379,6 @@ class ErnieMEncoder(nn.Module):
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ErnieM
class ErnieMPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -599,7 +597,6 @@ class ErnieMModel(ErnieMPreTrainedModel):
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->ErnieM,bert->ernie_m
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -701,7 +698,6 @@ class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->ErnieM,bert->ernie_m
    def __init__(self, config):
        super().__init__(config)
@@ -791,7 +787,6 @@ class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->ErnieM,bert->ernie_m
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -872,7 +867,6 @@ class ErnieMForTokenClassification(ErnieMPreTrainedModel):
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->ErnieM,bert->ernie_m
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -968,7 +962,6 @@ class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
    compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
    ERNIE_M_START_DOCSTRING,
)
# Copied from paddlenlp.transformers.ernie_m.modeling.UIEM
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
    def __init__(self, config):
        super(ErnieMForInformationExtraction, self).__init__(config)
...
@@ -45,7 +45,6 @@ _CHECKPOINT_FOR_DOC = "Tanrei/GPTSAN-japanese"
####################################################
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.router_z_loss_func
def router_z_loss_func(router_logits: torch.Tensor) -> float:
    r"""
    Compute the router z-loss implemented in PyTorch.
@@ -66,7 +65,6 @@ def router_z_loss_func(router_logits: torch.Tensor) -> float:
    return torch.sum(z_loss) / (num_groups * tokens_per_group)
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.load_balancing_loss_func
def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
@@ -140,7 +138,6 @@ class GPTSanJapaneseDenseActDense(nn.Module):
        return hidden_states
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersTop1Router with SwitchTransformers->GPTSanJapanese
class GPTSanJapaneseTop1Router(nn.Module):
    """
    Router using tokens choose top-1 experts assignment.
@@ -234,7 +231,6 @@ class GPTSanJapaneseTop1Router(nn.Module):
        return expert_index, router_probs, router_logits
# Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersSparseMLP with SwitchTransformers->GPTSanJapanese
class GPTSanJapaneseSparseMLP(nn.Module):
    r"""
    Implementation of the Switch Transformers Sparse MLP module.
@@ -345,7 +341,6 @@ class GPTSanJapaneseLayerDenseFF(nn.Module):
        return output
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->GPTSanJapanese
class GPTSanJapaneseAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -749,7 +744,6 @@ class GPTSanJapanesePreTrainedModel(PreTrainedModel):
                module.experts[f"expert_{idx}"].wi.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
                module.experts[f"expert_{idx}"].wo.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right
    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id
@@ -1298,17 +1292,14 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
            "past_key_values": None,
        }
    # Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.prepare_decoder_input_ids_from_labels with SwitchTransformers->GPTSanJapanese
    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)
    # Copied from transformers.models.mbart.modeling_mbart.MBartForConditionalGeneration.resize_token_embeddings with MBart->GPTSanJapanese
    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings
    # Copied from transformers.models.mbart.modeling_mbart.MBartForConditionalGeneration._resize_final_logits_bias with MBart->GPTSanJapanese
    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
@@ -1324,15 +1315,12 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
    def set_input_embeddings(self, new_embeddings):
        self.model.set_input_embeddings(new_embeddings)
    # Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.set_output_embeddings with SwitchTransformers->GPTSanJapanese
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
    # Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration.get_output_embeddings with SwitchTransformers->GPTSanJapanese
    def get_output_embeddings(self):
        return self.lm_head
    # Copied from transformers.models.switch_transformers.modeling_switch_transformers.SwitchTransformersForConditionalGeneration._unpack_router_logits with SwitchTransformers->GPTSanJapanese
    def _unpack_router_logits(self, router_outputs):
        total_router_logits = []
        total_expert_indexes = []
...
@@ -179,25 +179,20 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
        )
    @property
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.vocab_size
    def vocab_size(self):
        # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
        return len(self.raw_vocab)
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.get_vocab
    def get_vocab(self):
        return dict(self.raw_vocab, **self.added_tokens_encoder)
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._tokenize
    def _tokenize(self, text):
        return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.subword_tokenizer.convert_id_to_token(index)
@@ -254,7 +249,6 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
            "{% endfor %}"
        )
    # Copied from tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@@ -412,7 +406,6 @@ class SubWordJapaneseTokenizer(object):
    SOFTWARE.
    """
    # Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__init__
    def __init__(self, vocab, ids_to_tokens, emoji):
        self.vocab = vocab  # same as swe
        self.ids_to_tokens = ids_to_tokens  # same as bpe
@@ -434,11 +427,9 @@ class SubWordJapaneseTokenizer(object):
        blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
        self.content_trans1 = str.maketrans({k: "<BLOCK>" for k in keisen + blocks})
    # Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.__len__
    def __len__(self):
        return len(self.ids_to_tokens)
    # Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.clean_text
    def clean_text(self, content):
        content = self.content_repatter1.sub("<URL>", content)
        content = self.content_repatter2.sub("<EMAIL>", content)
@@ -451,7 +442,6 @@ class SubWordJapaneseTokenizer(object):
            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
        return content
    # Copied from tokenization_gpt_neox_japanese.SubWordJapaneseTokenizer.tokenize
    def tokenize(self, text, clean=False):
        text = text.replace(" ", "<SP>")
        text = text.replace("　", "<SP>")
...
@@ -256,7 +256,6 @@ class NatDownsampler(nn.Module):
        return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
@@ -277,7 +276,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
    return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat
class NatDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
...
@@ -346,7 +346,6 @@ class NezhaSelfAttention(nn.Module):
        return outputs
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Nezha
class NezhaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -410,7 +409,6 @@ class NezhaAttention(nn.Module):
        return outputs
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Nezha
class NezhaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -426,7 +424,6 @@ class NezhaIntermediate(nn.Module):
        return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Nezha
class NezhaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -527,7 +524,6 @@ class NezhaLayer(nn.Module):
        return layer_output
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Nezha
class NezhaEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -621,7 +617,6 @@ class NezhaEncoder(nn.Module):
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Nezha
class NezhaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -637,7 +632,6 @@ class NezhaPooler(nn.Module):
        return pooled_output
# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Nezha
class NezhaPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -655,7 +649,6 @@ class NezhaPredictionHeadTransform(nn.Module):
        return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Nezha
class NezhaLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -679,7 +672,6 @@ class NezhaLMPredictionHead(nn.Module):
        return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Nezha
class NezhaOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -690,7 +682,6 @@ class NezhaOnlyMLMHead(nn.Module):
        return prediction_scores
# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Nezha
class NezhaOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -701,7 +692,6 @@ class NezhaOnlyNSPHead(nn.Module):
        return seq_relationship_score
# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Nezha
class NezhaPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
...
@@ -145,7 +145,6 @@ class OpenLlamaConfig(PretrainedConfig):
            **kwargs,
        )
    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
...
@@ -46,7 +46,6 @@ except ImportError:
_CONFIG_FOR_DOC = "OpenLlamaConfig"
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->OpenLlama
class OpenLlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
@@ -64,7 +63,6 @@ class OpenLlamaRMSNorm(nn.Module):
        return self.weight * hidden_states.to(input_dtype)
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->OpenLlama
class OpenLlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
@@ -101,7 +99,6 @@ class OpenLlamaRotaryEmbedding(nn.Module):
        )
# Copied from transformers.models.falcon.modeling_falcon.FalconLinearScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
    """OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
@@ -121,7 +118,6 @@ class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
# Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
    """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
@@ -155,7 +151,6 @@ def rotate_half(x):
    return torch.cat((-x2, x1), dim=-1)
# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.
@@ -228,7 +223,6 @@ class OpenLlamaAttention(nn.Module):
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self._init_rope()
    # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->OpenLlama
    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = OpenLlamaRotaryEmbedding(
...
@@ -142,7 +142,6 @@ def load_tf_weights_in_qdqbert(model, tf_checkpoint_path):
    return model
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert -> QDQBert
class QDQBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
@@ -628,7 +627,6 @@ class QDQBertEncoder(nn.Module):
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert -> QDQBert
class QDQBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -644,7 +642,6 @@ class QDQBertPooler(nn.Module):
        return pooled_output
# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert -> QDQBert
class QDQBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -697,7 +694,6 @@ class QDQBertOnlyMLMHead(nn.Module):
        return prediction_scores
# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert -> QDQBert
class QDQBertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
...
@@ -150,7 +150,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
    return model
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->Realm
class RealmEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
@@ -215,7 +214,6 @@ class RealmEmbeddings(nn.Module):
        return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Realm
class RealmSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
@@ -350,7 +348,6 @@ class RealmSelfAttention(nn.Module):
        return outputs
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Realm
class RealmSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -370,7 +367,6 @@ REALM_SELF_ATTENTION_CLASSES = {
}
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm,BERT->REALM
class RealmAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
@@ -422,7 +418,6 @@ class RealmAttention(nn.Module):
        return outputs
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Realm
class RealmIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -438,7 +433,6 @@ class RealmIntermediate(nn.Module):
        return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Realm
class RealmOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -453,7 +447,6 @@ class RealmOutput(nn.Module):
        return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Realm
class RealmLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -540,7 +533,6 @@ class RealmLayer(nn.Module):
        return layer_output
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Realm
class RealmEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
@@ -634,7 +626,6 @@ class RealmEncoder(nn.Module):
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Realm
class RealmPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
...
...@@ -28,7 +28,6 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
...@@ -40,7 +39,6 @@ def load_vocab(vocab_file):
return vocab
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
...@@ -96,7 +94,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
model_input_names = ["input_ids", "attention_mask"]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.__init__
def __init__(
self,
vocab_file,
...@@ -145,20 +142,16 @@ class RetriBertTokenizer(PreTrainedTokenizer):
)
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
def do_lower_case(self):
return self.basic_tokenizer.do_lower_case
@property
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
def vocab_size(self):
return len(self.vocab)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
def _tokenize(self, text, split_special_tokens=False):
split_tokens = []
if self.do_basic_tokenize:
...@@ -174,23 +167,19 @@ class RetriBertTokenizer(PreTrainedTokenizer):
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
...@@ -216,7 +205,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
...@@ -245,7 +233,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
...@@ -275,7 +262,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0
if os.path.isdir(save_directory):
...@@ -297,7 +283,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
return (vocab_file,)
# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
...@@ -459,7 +444,6 @@ class BasicTokenizer(object):
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
...
...@@ -76,7 +76,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
slow_tokenizer_class = RetriBertTokenizer
model_input_names = ["input_ids", "attention_mask"]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.__init__
def __init__(
self,
vocab_file=None,
...@@ -119,7 +118,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
self.do_lower_case = do_lower_case
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
...@@ -144,7 +142,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
return output
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
...@@ -174,7 +171,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
...@@ -36,7 +36,6 @@ _CONFIG_FOR_DOC = "Speech2Text2Config"
_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextSinusoidalPositionalEmbedding with Speech2Text->Speech2Text2
class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
...@@ -107,7 +106,6 @@ class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
return incremental_indices.long() + padding_idx
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Speech2Text2
class Speech2Text2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
...
...@@ -340,7 +340,6 @@ class TvltAudioPatchEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfAttention with Vilt->Tvlt
class TvltSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
...@@ -401,7 +400,6 @@ class TvltSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfOutput with Vilt->Tvlt
class TvltSelfOutput(nn.Module):
"""
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
...@@ -420,7 +418,6 @@ class TvltSelfOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltAttention with Vilt->Tvlt
class TvltAttention(nn.Module):
def __init__(self, config):
super().__init__()
...@@ -455,7 +452,6 @@ class TvltAttention(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltIntermediate with Vilt->Tvlt
class TvltIntermediate(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
...@@ -472,7 +468,6 @@ class TvltIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltOutput with Vilt->Tvlt
class TvltOutput(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
...@@ -488,7 +483,6 @@ class TvltOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltLayer with Vilt->Tvlt
class TvltLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
...@@ -527,7 +521,6 @@ class TvltLayer(nn.Module):
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltEncoder with Vilt->Tvlt
class TvltEncoder(nn.Module):
def __init__(self, config):
super().__init__()
...
...@@ -48,7 +48,6 @@ _IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
...@@ -69,7 +68,6 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = Fals
return output
# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->Van
class VanDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
...
...@@ -140,7 +140,6 @@ class ViTHybridImageProcessor(BaseImageProcessor):
"input_data_format",
]
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
self,
image: np.ndarray,
...
...@@ -51,7 +51,6 @@ class ViTHybridEmbeddings(nn.Module):
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
"""
# Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.__init__ with ViT->ViTHybrid
def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None:
super().__init__()
...@@ -186,7 +185,6 @@ class ViTHybridPatchEmbeddings(nn.Module):
return embeddings
# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTHybrid
class ViTHybridSelfAttention(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
...@@ -247,7 +245,6 @@ class ViTHybridSelfAttention(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTSdpaSelfAttention with ViT->ViTHybrid
class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
...@@ -279,7 +276,6 @@ class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
return context_layer, None
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTHybrid
class ViTHybridSelfOutput(nn.Module):
"""
The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the
...@@ -298,7 +294,6 @@ class ViTHybridSelfOutput(nn.Module):
return hidden_states
# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTHybrid
class ViTHybridAttention(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
...@@ -338,14 +333,12 @@ class ViTHybridAttention(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->ViTHybrid
class ViTHybridSdpaAttention(ViTHybridAttention):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
self.attention = ViTHybridSdpaSelfAttention(config)
# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTHybrid
class ViTHybridIntermediate(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
...@@ -362,7 +355,6 @@ class ViTHybridIntermediate(nn.Module):
return hidden_states
# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTHybrid
class ViTHybridOutput(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
...@@ -427,7 +419,6 @@ class ViTHybridLayer(nn.Module):
return outputs
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTHybrid
class ViTHybridEncoder(nn.Module):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__()
...@@ -479,7 +470,6 @@ class ViTHybridEncoder(nn.Module):
)
# Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel with ViT->ViTHybrid
class ViTHybridPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
...@@ -558,7 +548,6 @@ VIT_INPUTS_DOCSTRING = r"""
"The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.",
VIT_START_DOCSTRING,
)
# Copied from transformers.models.vit.modeling_vit.ViTModel with ViT->ViTHybrid
class ViTHybridModel(ViTHybridPreTrainedModel):
def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
super().__init__(config)
...@@ -654,7 +643,6 @@ class ViTHybridModel(ViTHybridPreTrainedModel):
)
# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViT->ViTHybrid
class ViTHybridPooler(nn.Module):
def __init__(self, config: ViTHybridConfig):
super().__init__()
...@@ -677,7 +665,6 @@ class ViTHybridPooler(nn.Module):
""",
VIT_START_DOCSTRING,
)
# Copied from transformers.models.vit.modeling_vit.ViTForImageClassification with ViT->ViTHybrid
class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
def __init__(self, config: ViTHybridConfig) -> None:
super().__init__(config)
...
...@@ -44,7 +44,6 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "XLMProphetNetConfig"
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_START_DOCSTRING with ProphetNetConfig->XLMProphetNetConfig
XLM_PROPHETNET_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
...@@ -64,7 +63,6 @@ XLM_PROPHETNET_START_DOCSTRING = r"""
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_INPUTS_DOCSTRING with ProphetNet->XLMProphetNet
XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
...@@ -139,7 +137,6 @@ XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
"""
# Copied from src.transformers.models.prophetnet.modeling_prophetnet.PROPHETNET_STANDALONE_INPUTS_DOCSTRING with ProphetNet->XLMProphetNet
XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
...@@ -174,7 +171,6 @@ XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.prophetnet.modeling_prophetnet.softmax
def softmax(hidden_state, dim, onnx_trace=False):
if onnx_trace:
return nn.functional.softmax(hidden_state.float(), dim=dim)
...@@ -182,7 +178,6 @@ def softmax(hidden_state, dim, onnx_trace=False):
return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ngram_attention_bias
def ngram_attention_bias(sequence_length, ngram, device, dtype):
"""
This function computes the bias for the predict stream
...@@ -200,7 +195,6 @@ def ngram_attention_bias(sequence_length, ngram, device, dtype):
return torch.cat([left_block, right_block], dim=2)
# Copied from transformers.models.prophetnet.modeling_prophetnet.compute_relative_buckets
def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
"""
This function computes individual parts of the relative position buckets. For more detail, see paper.
...@@ -228,7 +222,6 @@ def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_b
return rel_positions_bucket
# Copied from transformers.models.prophetnet.modeling_prophetnet.compute_all_stream_relative_buckets
def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
"""
This function computes both main and predict relative position buckets. For more detail, see paper.
...@@ -253,7 +246,6 @@ def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids)
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
...@@ -339,7 +331,6 @@ class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
...@@ -426,7 +417,6 @@ class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetDecoderModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
...@@ -487,7 +477,6 @@ class XLMProphetNetDecoderModelOutput(ModelOutput):
@dataclass
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput with ProphetNet->XLMProphetNet all-casing
class XLMProphetNetDecoderLMOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
...@@ -549,7 +538,6 @@ class XLMProphetNetDecoderLMOutput(ModelOutput):
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPreTrainedModel with ProphetNet->XLMProphetNet
class XLMProphetNetPreTrainedModel(PreTrainedModel):
config_class = XLMProphetNetConfig
base_model_prefix = "prophetnet"
...@@ -588,7 +576,6 @@ class XLMProphetNetPreTrainedModel(PreTrainedModel):
return shifted_input_ids
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPositionalEmbeddings with ProphetNet->XLMProphetNet
class XLMProphetNetPositionalEmbeddings(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
...@@ -632,7 +619,6 @@ class XLMProphetNetPositionalEmbeddings(nn.Embedding):
return super().forward(position_ids)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetAttention with ProphetNet->XLMProphetNet
class XLMProphetNetAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
...@@ -762,7 +748,6 @@ class XLMProphetNetAttention(nn.Module):
return attn_output, attn_weights_reshaped, past_key_value
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetFeedForward with ProphetNet->XLMProphetNet
class XLMProphetNetFeedForward(nn.Module):
"""
This is the residual two feed-forward layer block based on the original Transformer implementation.
...@@ -786,7 +771,6 @@ class XLMProphetNetFeedForward(nn.Module):
return hidden_states
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetNgramSelfAttention with ProphetNet->XLMProphetNet
class XLMProphetNetNgramSelfAttention(nn.Module):
def __init__(self, config: XLMProphetNetConfig):
super().__init__()
...@@ -1106,7 +1090,6 @@ class XLMProphetNetNgramSelfAttention(nn.Module):
return predict_relative_pos_embeddings
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoderLayer with ProphetNet->XLMProphetNet, Prophetnet->XLMProphetnet
class XLMProphetNetEncoderLayer(nn.Module):
"""
Encoder block for XLMProphetnet
...@@ -1150,7 +1133,6 @@ class XLMProphetNetEncoderLayer(nn.Module):
return outputs
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLayer with Prophetnet->XLMProphetnet, ProphetNet->XLMProphetNet
class XLMProphetNetDecoderLayer(nn.Module):
"""
Decoder block for XLMProphetnet
...@@ -1239,7 +1221,6 @@ class XLMProphetNetDecoderLayer(nn.Module):
"The standalone encoder part of the XLMProphetNetModel.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetEncoder with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
r"""
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
...@@ -1374,7 +1355,6 @@ class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
"The standalone decoder part of the XLMProphetNetModel.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoder with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET,
class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
r"""
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
...@@ -1743,7 +1723,6 @@ class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
"The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetModel with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
...@@ -1878,7 +1857,6 @@ class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
"The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
_tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
...@@ -2073,7 +2051,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
return self._shift_right(labels)
@staticmethod
# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration._reorder_cache
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
...@@ -2096,7 +2073,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
" language modeling.",
XLM_PROPHETNET_START_DOCSTRING,
)
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForCausalLM with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET
class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
_tied_weights_keys = [
"prophetnet.word_embeddings.weight",
...@@ -2329,7 +2305,6 @@ class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
}
@staticmethod
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM._reorder_cache
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
...@@ -2339,7 +2314,6 @@ class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
return reordered_past
# Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderWrapper with ProphetNet->XLMProphetNet, prophetnet->XLMProphetNet
class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel):
"""
This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet
...