Commit ba06124e (unverified)
Authored Sep 18, 2024 by Aryan; committed by GitHub Sep 17, 2024
Parent: bb1b0fa1

Remove CogVideoX mentions from single file docs; Test updates (#9444)

* remove mentions from single file
* update tests
* update
Showing 4 changed files with 9 additions and 18 deletions (+9 -18):
* docs/source/en/api/loaders/single_file.md (+0 -4)
* tests/pipelines/cogvideo/test_cogvideox.py (+3 -6)
* tests/pipelines/cogvideo/test_cogvideox_image2video.py (+3 -2)
* tests/pipelines/cogvideo/test_cogvideox_video2video.py (+3 -6)
docs/source/en/api/loaders/single_file.md

```diff
@@ -22,9 +22,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 
 ## Supported pipelines
 
-- [`CogVideoXPipeline`]
-- [`CogVideoXImageToVideoPipeline`]
-- [`CogVideoXVideoToVideoPipeline`]
 - [`StableDiffusionPipeline`]
 - [`StableDiffusionImg2ImgPipeline`]
 - [`StableDiffusionInpaintPipeline`]
@@ -52,7 +49,6 @@ The [`~loaders.FromSingleFileMixin.from_single_file`] method allows you to load:
 - [`UNet2DConditionModel`]
 - [`StableCascadeUNet`]
 - [`AutoencoderKL`]
-- [`AutoencoderKLCogVideoX`]
 - [`ControlNetModel`]
 - [`SD3Transformer2DModel`]
 - [`FluxTransformer2DModel`]
```
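For context, `from_single_file` loads a whole pipeline or model from a single checkpoint file rather than the usual multi-folder repository layout; after this commit, the docs no longer advertise that path for the CogVideoX classes. A minimal sketch with a pipeline that remains in the supported list; the checkpoint path is a hypothetical placeholder, not taken from this commit:

```python
# Minimal single-file loading sketch, assuming a Stable Diffusion v1-style
# .safetensors checkpoint; the path below is a placeholder, not from the docs.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_single_file("path/to/v1-5-checkpoint.safetensors")
image = pipe("a photo of an astronaut riding a horse").images[0]
```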
tests/pipelines/cogvideo/test_cogvideox.py

```diff
@@ -57,6 +57,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -71,8 +72,8 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -280,10 +281,6 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
     def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
```
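The `sample_width`/`sample_height` change above switches the dummy transformer config from pixel-space values (16) to latent-space values (2), so the config finally agrees with its own comments. A quick check of the arithmetic in those comments, assuming the usual CogVideoX compression factors (an 8x spatial factor from the VAE, which is an assumption consistent with the comments, and the 4x `temporal_compression_ratio` from the config):

```python
# Recomputing the comment arithmetic; the spatial factor of 8 is an assumption
# consistent with "latent width: 2 -> final width: 16" in the test comments.
vae_scale_factor_spatial = 8
temporal_compression_ratio = 4

sample_width, sample_height = 2, 2  # latent-space values from the updated config
sample_frames = 9                   # pixel-space frame count from the config

final_width = sample_width * vae_scale_factor_spatial    # 2 -> 16
final_height = sample_height * vae_scale_factor_spatial  # 2 -> 16
latent_frames = (sample_frames - 1) // temporal_compression_ratio + 1  # (9 - 1) / 4 + 1 = 3

assert (final_width, final_height, latent_frames) == (16, 16, 3)
```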
tests/pipelines/cogvideo/test_cogvideox_image2video.py

```diff
@@ -269,8 +269,9 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         generator_device = "cpu"
         components = self.get_dummy_components()
 
-        # The reason to modify it this way is because I2V Transformer limits the generation to resolutions.
-        # See the if-statement on "self.use_learned_positional_embeddings"
+        # The reason to modify it this way is because I2V Transformer limits the generation to resolutions used during initalization.
+        # This limitation comes from using learned positional embeddings which cannot be generated on-the-fly like sincos or RoPE embeddings.
+        # See the if-statement on "self.use_learned_positional_embeddings" in diffusers/models/embeddings.py
         components["transformer"] = CogVideoXTransformer3DModel.from_config(
             components["transformer"].config,
             sample_height=16,
```
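The expanded comment explains why the test rebuilds the transformer via `CogVideoXTransformer3DModel.from_config(..., sample_height=16, ...)`: a learned positional embedding is a parameter whose shape is fixed when the model is constructed, so a different resolution requires re-initialization, whereas sincos or RoPE embeddings can be computed for any size at call time. A toy illustration of that difference (not the diffusers implementation; shapes are made up for the example):

```python
# Toy contrast between a learned positional embedding (fixed-size parameter)
# and an on-the-fly sincos table; illustrative only, not diffusers code.
import torch
import torch.nn as nn

embed_dim = 32

# Learned: sized for one specific number of patches at init, which is why the
# test re-creates the transformer through from_config for a new resolution.
learned_pos_embed = nn.Parameter(torch.zeros(1, 12, embed_dim))  # fixed at 12 patches

def sincos_positions(num_positions: int, dim: int) -> torch.Tensor:
    """1D sine-cosine embeddings; works for any num_positions without retraining."""
    freqs = torch.exp(-torch.arange(0, dim, 2).float() * (torch.log(torch.tensor(10000.0)) / dim))
    angles = torch.arange(num_positions).float()[:, None] * freqs[None, :]
    return torch.cat([angles.sin(), angles.cos()], dim=-1)

print(sincos_positions(12, embed_dim).shape)  # torch.Size([12, 32])
print(sincos_positions(48, embed_dim).shape)  # torch.Size([48, 32]) -- no re-init needed
```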
tests/pipelines/cogvideo/test_cogvideox_video2video.py

```diff
@@ -51,6 +51,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "callback_on_step_end_tensor_inputs",
         ]
     )
+    test_xformers_attention = False
 
     def get_dummy_components(self):
         torch.manual_seed(0)
@@ -65,8 +66,8 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             time_embed_dim=2,
             text_embed_dim=32,  # Must match with tiny-random-t5
             num_layers=1,
-            sample_width=16,  # latent width: 2 -> final width: 16
-            sample_height=16,  # latent height: 2 -> final height: 16
+            sample_width=2,  # latent width: 2 -> final width: 16
+            sample_height=2,  # latent height: 2 -> final height: 16
             sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
             patch_size=2,
             temporal_compression_ratio=4,
@@ -285,10 +286,6 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             "VAE tiling should not affect the inference results",
         )
 
-    @unittest.skip("xformers attention processor does not exist for CogVideoX")
-    def test_xformers_attention_forwardGenerator_pass(self):
-        pass
-
    def test_fused_qkv_projections(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
```
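Both test files replace a per-class `@unittest.skip` on `test_xformers_attention_forwardGenerator_pass` with the class attribute `test_xformers_attention = False`, letting the shared tester machinery decide whether to run the xformers check. A hedged sketch of how such a flag can gate a shared test; `PipelineTesterMixin`'s actual internals may differ:

```python
# Sketch of gating a shared test on a class attribute instead of decorating
# each subclass with @unittest.skip; this mirrors the intent of the change,
# not the exact PipelineTesterMixin implementation.
import unittest

class XFormersGateSketch(unittest.TestCase):
    test_xformers_attention = True  # subclasses set False when unsupported

    def test_xformers_attention_forwardGenerator_pass(self):
        if not self.test_xformers_attention:
            self.skipTest("xformers attention processor does not exist for this pipeline")
        # ... the shared xformers forward-pass check would run here
```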