[tests] add a test on torch compile for varied resolutions (#11776)

* add test for checking compile on different shapes. * update * update * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

[tests] add a test on torch compile for varied resolutions (#11776)
* add test for checking compile on different shapes. * update * update * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
a185e1ab · Sayak Paul · GitHub · d93381cd · a185e1ab · a185e1ab
Unverified Commit a185e1ab authored Jun 26, 2025 by Sayak Paul Committed by GitHub Jun 26, 2025
3 changed files
--- a/docs/source/en/optimization/fp16.md
+++ b/docs/source/en/optimization/fp16.md
@@ -150,6 +150,28 @@ pipeline(prompt, num_inference_steps=30).images[0]

 Compilation is slow the first time, but once compiled, it is significantly faster. Try to only use the compiled pipeline on the same type of inference operations. Calling the compiled pipeline on a different image size retriggers compilation which is slow and inefficient.

+### Dynamic shape compilation
+
+> [!TIP]
+> Make sure to always use the nightly version of PyTorch for better support.
+
+`torch.compile` keeps track of input shapes and conditions, and if these are different, it recompiles the model. For example, if a model is compiled on a 1024x1024 resolution image and used on an image with a different resolution, it triggers recompilation.
+
+To avoid recompilation when input conditions change, pass `dynamic=True` so that `torch.compile` tries to generate a more dynamic kernel.
+
+```diff
+ torch.fx.experimental._config.use_duck_shape = False
+ pipeline.unet = torch.compile(
+    pipeline.unet, fullgraph=True, dynamic=True
+)
+```
+
+Setting `use_duck_shape=False` tells the compiler not to reuse the same symbolic variable for input sizes that merely happen to be equal. For more details, check out this [comment](https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790).
+
+Not every model benefits from dynamic compilation out of the box; some may require changes. Refer to this [PR](https://github.com/huggingface/diffusers/pull/11297/) that improved the [`AuraFlowPipeline`] implementation to benefit from dynamic compilation.
+
+Feel free to open an issue if dynamic compilation doesn't work as expected for a Diffusers model.
+
 ### Regional compilation



--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -76,6 +76,7 @@ from diffusers.utils.testing_utils import (
    require_torch_accelerator_with_training,
    require_torch_gpu,
    require_torch_multi_accelerator,
+    require_torch_version_greater,
    run_test_in_subprocess,
    slow,
    torch_all_close,
@@ -1907,6 +1908,8 @@ class ModelPushToHubTester(unittest.TestCase):
 @is_torch_compile
 @slow
 class TorchCompileTesterMixin:
+    different_shapes_for_compilation = None
+
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
@@ -1957,14 +1960,14 @@ class TorchCompileTesterMixin:
            _ = model(**inputs_dict)

    def test_compile_with_group_offloading(self):
+        if not self.model_class._supports_group_offloading:
+            pytest.skip("Model does not support group offloading.")
+
        torch._dynamo.config.cache_size_limit = 10000

        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
        model = self.model_class(**init_dict)

-        if not getattr(model, "_supports_group_offloading", True):
-            return
-
        model.eval()
        # TODO: Can test for other group offloading kwargs later if needed.
        group_offload_kwargs = {
@@ -1981,6 +1984,21 @@ class TorchCompileTesterMixin:
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)

+    @require_torch_version_greater("2.7.1")
+    def test_compile_on_different_shapes(self):
+        if self.different_shapes_for_compilation is None:
+            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")
+        torch.fx.experimental._config.use_duck_shape = False
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True, dynamic=True)
+
+        for height, width in self.different_shapes_for_compilation:
+            with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+                inputs_dict = self.prepare_dummy_input(height=height, width=width)
+                _ = model(**inputs_dict)
+

 @slow
 @require_torch_2

--- a/tests/models/transformers/test_models_transformer_flux.py
+++ b/tests/models/transformers/test_models_transformer_flux.py
@@ -91,10 +91,20 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):

    @property
    def dummy_input(self):
+        return self.prepare_dummy_input()
+
+    @property
+    def input_shape(self):
+        return (16, 4)
+
+    @property
+    def output_shape(self):
+        return (16, 4)
+
+    def prepare_dummy_input(self, height=4, width=4):
        batch_size = 1
        num_latent_channels = 4
        num_image_channels = 3
-        height = width = 4
        sequence_length = 48
        embedding_dim = 32

@@ -114,14 +124,6 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):
            "timestep": timestep,
        }

-    @property
-    def input_shape(self):
-        return (16, 4)
-
-    @property
-    def output_shape(self):
-        return (16, 4)
-
    def prepare_init_args_and_inputs_for_common(self):
        init_dict = {
            "patch_size": 1,
@@ -173,10 +175,14 @@ class FluxTransformerTests(ModelTesterMixin, unittest.TestCase):

 class FluxTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
    model_class = FluxTransformer2DModel
+    different_shapes_for_compilation = [(4, 4), (4, 8), (8, 8)]

    def prepare_init_args_and_inputs_for_common(self):
        return FluxTransformerTests().prepare_init_args_and_inputs_for_common()

+    def prepare_dummy_input(self, height, width):
+        return FluxTransformerTests().prepare_dummy_input(height=height, width=width)
+

 class FluxTransformerLoRAHotSwapTests(LoraHotSwappingForModelTesterMixin, unittest.TestCase):
    model_class = FluxTransformer2DModel