Unverified Commit fb0a38b4 authored by Antoni Viros, committed by GitHub


Move torch.compile() wrapping after DDP/FSDP wrapping to ensure correct graph breaks during training (#22279)
parent 8ac29fe0
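
To illustrate the ordering this commit enforces, here is a minimal, self-contained sketch. It is not taken from the PR; the function name `wrap_for_training` and the `local_rank` handling are placeholders. The point is that the model is wrapped with DDP first and `torch.compile()` is called last, so the compiler sees the wrapped module and can insert the graph breaks the wrapper requires.

```python
# Minimal sketch of the wrapping order enforced by this commit.
# Assumptions: a distributed process group is already initialized and
# `local_rank` identifies the current GPU; the names here are illustrative.
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_for_training(model: nn.Module, local_rank: int, use_compile: bool = True) -> nn.Module:
    # 1) Wrap with DDP first so the compiled graph includes the DDP wrapper
    #    and the graph breaks it requires.
    model = DDP(model.to(local_rank), device_ids=[local_rank])

    # 2) Compile last; compiling before DDP would capture only the bare
    #    module and miss the wrapper added around it.
    if use_compile:
        model = torch.compile(model)

    return model
```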
@@ -1361,9 +1361,6 @@ class Trainer:
         return model
 
     def _wrap_model(self, model, training=True, dataloader=None):
-        if self.args.torch_compile:
-            model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode)
-
         if self.args.use_ipex:
             dtype = torch.bfloat16 if self.use_cpu_amp else torch.float32
             model = self.ipex_optimize_model(model, training, dtype=dtype)
@@ -1550,6 +1547,11 @@ class Trainer:
                 **kwargs,
             )
 
+        # torch.compile() needs to be called after wrapping the model with FSDP or DDP
+        # to ensure that it accounts for the graph breaks required by those wrappers
+        if self.args.torch_compile:
+            model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode)
+
         return model
 
     def train(
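
For context, a hedged usage sketch of how this code path is reached from user code: setting `torch_compile=True` (optionally with `torch_compile_backend` / `torch_compile_mode`, the same fields referenced by `self.args` in the diff above) makes `Trainer._wrap_model()` apply `torch.compile()` after the DDP/FSDP wrapping. The output directory and the specific backend/mode values below are illustrative only.

```python
# Usage sketch (assumed typical values; only the torch_compile* fields are
# the ones referenced by the diff above).
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="out",                   # placeholder path
    torch_compile=True,                 # enables the torch.compile() branch in _wrap_model()
    torch_compile_backend="inductor",   # assumed backend choice
    torch_compile_mode="default",       # assumed mode choice
)

# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
# trainer.train()  # DDP/FSDP wrapping happens first, torch.compile() is applied afterwards
```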