[core] Layerwise Upcasting (#10347)

* update * update * make style * remove dynamo disable * add coauthor Co-Authored-By: Dhruv Nair <dhruv.nair@gmail.com> * update * update * update * update mixin * add some basic tests * update * update * non_blocking * improvements * update * norm.* -> norm * apply suggestions from review * add example * update hook implementation to the latest changes from pyramid attention broadcast * deinitialize should raise an error * update doc page * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * update docs * update * refactor * fix _always_upcast_modules for asym ae and vq_model * fix lumina embedding forward to not depend on weight dtype * refactor tests * add simple lora inference tests * _always_upcast_modules -> _precision_sensitive_module_patterns * remove todo comments about review; revert changes to self.dtype in unets because .dtype on ModelMixin should be able to handle fp8 weight case * check layer dtypes in lora test * fix UNet1DModelTests::test_layerwise_upcasting_inference * _precision_sensitive_module_patterns -> _skip_layerwise_casting_patterns based on feedback * skip test in NCSNppModelTests * skip tests for AutoencoderTinyTests * skip tests for AutoencoderOobleckTests * skip tests for UNet1DModelTests - unsupported pytorch operations * layerwise_upcasting -> layerwise_casting * skip tests for UNetRLModelTests; needs next pytorch release for currently unimplemented operation support * add layerwise fp8 pipeline test * use xfail * Apply suggestions from code review Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com> * add assertion with fp32 comparison; add tolerance to fp8-fp32 vs fp32-fp32 comparison (required for a few models' test to pass) * add note about memory consumption on tesla CI runner for failing test --------- Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

[core] Layerwise Upcasting (#10347)
* update * update * make style * remove dynamo disable * add coauthor Co-Authored-By: Dhruv Nair <dhruv.nair@gmail.com> * update * update * update * update mixin * add some basic tests * update * update * non_blocking * improvements * update * norm.* -> norm * apply suggestions from review * add example * update hook implementation to the latest changes from pyramid attention broadcast * deinitialize should raise an error * update doc page * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * update docs * update * refactor * fix _always_upcast_modules for asym ae and vq_model * fix lumina embedding forward to not depend on weight dtype * refactor tests * add simple lora inference tests * _always_upcast_modules -> _precision_sensitive_module_patterns * remove todo comments about review; revert changes to self.dtype in unets because .dtype on ModelMixin should be able to handle fp8 weight case * check layer dtypes in lora test * fix UNet1DModelTests::test_layerwise_upcasting_inference * _precision_sensitive_module_patterns -> _skip_layerwise_casting_patterns based on feedback * skip test in NCSNppModelTests * skip tests for AutoencoderTinyTests * skip tests for AutoencoderOobleckTests * skip tests for UNet1DModelTests - unsupported pytorch operations * layerwise_upcasting -> layerwise_casting * skip tests for UNetRLModelTests; needs next pytorch release for currently unimplemented operation support * add layerwise fp8 pipeline test * use xfail * Apply suggestions from code review Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com> * add assertion with fp32 comparison; add tolerance to fp8-fp32 vs fp32-fp32 comparison (required for a few models' test to pass) * add note about memory consumption on tesla CI runner for failing test --------- Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
beacaa55 · Aryan · GitHub · a6476822 · beacaa55 · beacaa55
Unverified Commit beacaa55 authored Jan 22, 2025 by Aryan Committed by GitHub Jan 22, 2025
20 changed files
--- a/tests/pipelines/animatediff/test_animatediff.py
+++ b/tests/pipelines/animatediff/test_animatediff.py
@@ -60,6 +60,7 @@ class AnimateDiffPipelineFastTests(
            "callback_on_step_end_tensor_inputs",
        ]
    )
+    test_layerwise_casting = True

    def get_dummy_components(self):
        cross_attention_dim = 8

--- a/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
+++ b/tests/pipelines/aura_flow/test_pipeline_aura_flow.py
@@ -30,6 +30,7 @@ class AuraFlowPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        ]
    )
    batch_params = frozenset(["prompt", "negative_prompt"])
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/cogvideo/test_cogvideox.py
+++ b/tests/pipelines/cogvideo/test_cogvideox.py
@@ -58,6 +58,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        ]
    )
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/cogvideo/test_cogvideox_fun_control.py
+++ b/tests/pipelines/cogvideo/test_cogvideox_fun_control.py
@@ -55,6 +55,7 @@ class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCas
        ]
    )
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/cogview3/test_cogview3plus.py
+++ b/tests/pipelines/cogview3/test_cogview3plus.py
@@ -56,6 +56,7 @@ class CogView3PlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        ]
    )
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/consisid/test_consisid.py
+++ b/tests/pipelines/consisid/test_consisid.py
@@ -58,6 +58,7 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        ]
    )
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet/test_controlnet.py
+++ b/tests/pipelines/controlnet/test_controlnet.py
@@ -126,6 +126,7 @@ class ControlNetPipelineFastTests(
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    test_layerwise_casting = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet/test_controlnet_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py
@@ -75,6 +75,7 @@ class StableDiffusionXLControlNetPipelineFastTests(
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    test_layerwise_casting = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py
+++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py
@@ -50,6 +50,7 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):

    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
    batch_params = frozenset(["prompt"])
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py
+++ b/tests/pipelines/controlnet_hunyuandit/test_controlnet_hunyuandit.py
@@ -57,6 +57,7 @@ class HunyuanDiTControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMix
        ]
    )
    batch_params = frozenset(["prompt", "negative_prompt"])
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
+++ b/tests/pipelines/controlnet_sd3/test_controlnet_sd3.py
@@ -59,6 +59,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
        ]
    )
    batch_params = frozenset(["prompt", "negative_prompt"])
+    test_layerwise_casting = True

    def get_dummy_components(
        self, num_controlnet_layers: int = 3, qk_norm: Optional[str] = "rms_norm", use_dual_attention=False

--- a/tests/pipelines/controlnet_xs/test_controlnetxs.py
+++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py
@@ -139,6 +139,7 @@ class ControlNetXSPipelineFastTests(
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    test_attention_slicing = False
+    test_layerwise_casting = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        torch.manual_seed(0)

--- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
+++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py
@@ -78,6 +78,7 @@ class StableDiffusionXLControlNetXSPipelineFastTests(
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    test_attention_slicing = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -31,6 +31,7 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapte

    # there is no xformers processor for Flux
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/flux/test_pipeline_flux_control.py
+++ b/tests/pipelines/flux/test_pipeline_flux_control.py
@@ -22,6 +22,7 @@ class FluxControlPipelineFastTests(unittest.TestCase, PipelineTesterMixin):

    # there is no xformers processor for Flux
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/flux/test_pipeline_flux_fill.py
+++ b/tests/pipelines/flux/test_pipeline_flux_fill.py
@@ -23,6 +23,7 @@ class FluxFillPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
    params = frozenset(["prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"])
    batch_params = frozenset(["prompt"])
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py
+++ b/tests/pipelines/hunyuan_dit/test_hunyuan_dit.py
@@ -55,6 +55,7 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    required_optional_params = PipelineTesterMixin.required_optional_params
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/hunyuan_video/test_hunyuan_video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_video.py
@@ -53,6 +53,7 @@ class HunyuanVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

    # there is no xformers processor for Flux
    test_xformers_attention = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
+++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
@@ -61,6 +61,7 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit
    required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])

    supports_dduf = False
+    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

--- a/tests/pipelines/kolors/test_kolors.py
+++ b/tests/pipelines/kolors/test_kolors.py
@@ -48,6 +48,7 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"})

    supports_dduf = False
+    test_layerwise_casting = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        torch.manual_seed(0)