Unverified commit f370bebd, authored by Pedro Gabriel Gengo Lourenço, committed by GitHub

Bugfix device map detr model (#26849)



* Fixed replace_batch_norm when on meta device

* lint fix

* Adding coauthor
Co-authored-by: default avatarPi Esposito <piero.skywalker@gmail.com>

* Removed tests

* Remove unused deps

* Try to fix copy issue

* try fix copy one more time

* Reverted import changes

---------
Co-authored-by: Pi Esposito <piero.skywalker@gmail.com>
parent b0d1d7f7
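For context, a minimal sketch of the scenario this commit fixes (the checkpoint name is an illustrative assumption; any model in the DETR family applies). Before the meta-device guard below, loading with a device map failed inside replace_batch_norm, because accelerate first instantiates the model on the meta device, whose parameters hold no data to copy.

    # Hypothetical repro, assuming accelerate is installed.
    from transformers import DetrForObjectDetection

    # device_map="auto" materializes the model on the meta device first and
    # dispatches real weights afterwards; the new guard lets replace_batch_norm
    # skip copy_() while parameters are still data-less meta tensors.
    model = DetrForObjectDetection.from_pretrained(
        "facebook/detr-resnet-50",  # assumed checkpoint
        device_map="auto",
    )
    print(model.hf_device_map)  # inspect how modules were placed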
src/transformers/models/conditional_detr/modeling_conditional_detr.py
@@ -322,6 +322,7 @@ def replace_batch_norm(model):
         if isinstance(module, nn.BatchNorm2d):
             new_module = ConditionalDetrFrozenBatchNorm2d(module.num_features)
 
+            if not module.weight.device == torch.device("meta"):
                 new_module.weight.data.copy_(module.weight)
                 new_module.bias.data.copy_(module.bias)
                 new_module.running_mean.data.copy_(module.running_mean)
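A standalone sketch (not from the commit) of why the guard is needed: tensors on PyTorch's meta device are shape-only placeholders with no storage, so copying out of them raises.

    import torch
    import torch.nn as nn

    # Put a BatchNorm's parameters on the meta device, as accelerate's
    # init_empty_weights() does during device_map loading.
    bn = nn.BatchNorm2d(3).to_empty(device="meta")

    target = torch.empty(3)
    try:
        target.copy_(bn.weight)  # no underlying data to copy from
    except (NotImplementedError, RuntimeError) as err:
        print(f"copy from a meta tensor fails: {err}")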
@@ -1145,6 +1146,7 @@ class ConditionalDetrPreTrainedModel(PreTrainedModel):
     config_class = ConditionalDetrConfig
     base_model_prefix = "model"
     main_input_name = "pixel_values"
+    _no_split_modules = [r"ConditionalDetrConvEncoder", r"ConditionalDetrEncoderLayer", r"ConditionalDetrDecoderLayer"]
 
     def _init_weights(self, module):
         std = self.config.init_std
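What _no_split_modules buys: when from_pretrained builds a device map, these class names are handed to accelerate as no_split_module_classes, so none of the listed modules is ever sharded across devices. A sketch of the same computation done by hand, assuming accelerate's public API (the checkpoint is illustrative):

    from accelerate import infer_auto_device_map, init_empty_weights
    from transformers import AutoConfig, AutoModel

    config = AutoConfig.from_pretrained("microsoft/conditional-detr-resnet-50")
    with init_empty_weights():
        # Instantiating on meta also exercises the replace_batch_norm fix above.
        model = AutoModel.from_config(config)

    # Respect the boundaries declared by this commit: the conv encoder and each
    # encoder/decoder layer always land whole on a single device.
    device_map = infer_auto_device_map(
        model, no_split_module_classes=model._no_split_modules
    )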
src/transformers/models/deformable_detr/modeling_deformable_detr.py
@@ -369,6 +369,7 @@ def replace_batch_norm(model):
         if isinstance(module, nn.BatchNorm2d):
             new_module = DeformableDetrFrozenBatchNorm2d(module.num_features)
 
+            if not module.weight.device == torch.device("meta"):
                 new_module.weight.data.copy_(module.weight)
                 new_module.bias.data.copy_(module.bias)
                 new_module.running_mean.data.copy_(module.running_mean)
@@ -1061,6 +1062,7 @@ class DeformableDetrPreTrainedModel(PreTrainedModel):
     config_class = DeformableDetrConfig
     base_model_prefix = "model"
     main_input_name = "pixel_values"
+    _no_split_modules = [r"DeformableDetrConvEncoder", r"DeformableDetrEncoderLayer", r"DeformableDetrDecoderLayer"]
 
     def _init_weights(self, module):
         std = self.config.init_std
src/transformers/models/deta/modeling_deta.py
@@ -307,6 +307,7 @@ def replace_batch_norm(model):
         if isinstance(module, nn.BatchNorm2d):
             new_module = DetaFrozenBatchNorm2d(module.num_features)
 
+            if not module.weight.device == torch.device("meta"):
                 new_module.weight.data.copy_(module.weight)
                 new_module.bias.data.copy_(module.bias)
                 new_module.running_mean.data.copy_(module.running_mean)
@@ -947,11 +948,12 @@ class DetaClassificationHead(nn.Module):
         return hidden_states
 
 
-# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->Deta
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetrConvEncoder->DetaBackboneWithPositionalEncodings,DeformableDetr->Deta
 class DetaPreTrainedModel(PreTrainedModel):
     config_class = DetaConfig
     base_model_prefix = "model"
     main_input_name = "pixel_values"
+    _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"]
 
     def _init_weights(self, module):
         std = self.config.init_std
src/transformers/models/detr/modeling_detr.py
@@ -316,6 +316,7 @@ def replace_batch_norm(model):
         if isinstance(module, nn.BatchNorm2d):
             new_module = DetrFrozenBatchNorm2d(module.num_features)
 
+            if not module.weight.device == torch.device("meta"):
                 new_module.weight.data.copy_(module.weight)
                 new_module.bias.data.copy_(module.bias)
                 new_module.running_mean.data.copy_(module.running_mean)
@@ -901,6 +902,7 @@ class DetrPreTrainedModel(PreTrainedModel):
     config_class = DetrConfig
     base_model_prefix = "model"
     main_input_name = "pixel_values"
+    _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
 
     def _init_weights(self, module):
         std = self.config.init_std
src/transformers/models/table_transformer/modeling_table_transformer.py
@@ -251,6 +251,7 @@ def replace_batch_norm(model):
         if isinstance(module, nn.BatchNorm2d):
             new_module = TableTransformerFrozenBatchNorm2d(module.num_features)
 
+            if not module.weight.device == torch.device("meta"):
                 new_module.weight.data.copy_(module.weight)
                 new_module.bias.data.copy_(module.bias)
                 new_module.running_mean.data.copy_(module.running_mean)
@@ -813,6 +814,11 @@ class TableTransformerPreTrainedModel(PreTrainedModel):
     config_class = TableTransformerConfig
     base_model_prefix = "model"
     main_input_name = "pixel_values"
+    _no_split_modules = [
+        r"TableTransformerConvEncoder",
+        r"TableTransformerEncoderLayer",
+        r"TableTransformerDecoderLayer",
+    ]
 
     def _init_weights(self, module):
         std = self.config.init_std