Unverified Commit 4e898560 authored by Patrick von Platen, committed by GitHub

revert automatic chunking (#3934)

* revert automatic chunking

* Apply suggestions from code review

* revert automatic chunking
parent 332d2bbe
@@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt
 pipe.enable_model_cpu_offload()

 # memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()

 prompt = "Darth Vader surfing a wave"
@@ -150,10 +151,13 @@ Now the video can be upscaled:
 ```py
 pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16)
-pipe.vae.enable_slicing()
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
+
+# memory optimization
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+pipe.enable_vae_slicing()

 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
 video_frames = pipe(prompt, video=video, strength=0.6).frames
@@ -175,6 +179,28 @@ Here are some sample outputs:
 </tr>
 </table>
+### Memory optimizations
+
+Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive, both
+when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. It is possible, though, to reduce
+memory usage at the cost of increased runtime while still getting the exact same result. To do so, it is recommended to enable
+**forward chunking** and **VAE slicing**:
+
+Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`] is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and
+allows you to significantly reduce the memory required by the UNet. You can chunk the feed-forward layer over the `num_frames`
+dimension by doing:
+
+```py
+pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
+```
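To make what this call does concrete, here is a minimal standalone sketch of chunked feed-forward computation (the `chunked_feed_forward` helper is a toy illustration, not the diffusers implementation): the feed-forward module is applied to one slice of the input at a time along the chosen dimension and the slices are concatenated back, so peak activation memory scales with the chunk size rather than with `num_frames`:

```py
import torch
import torch.nn as nn


def chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_size: int, dim: int) -> torch.Tensor:
    # Run `ff` on `chunk_size`-sized slices along `dim` instead of on the full
    # tensor at once; since `ff` acts on the last dimension only, the
    # concatenated result is identical to the unchunked output.
    chunks = hidden_states.split(chunk_size, dim=dim)
    return torch.cat([ff(chunk) for chunk in chunks], dim=dim)


# Toy check: chunking over the frame dimension gives the same result.
ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
hidden_states = torch.randn(2, 16, 64)  # (batch, num_frames, channels)
full = ff(hidden_states)
chunked = chunked_feed_forward(ff, hidden_states, chunk_size=1, dim=1)
assert torch.allclose(full, chunked, atol=1e-6)
```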
+VAE slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also
+yields significant memory savings, since the two pipelines otherwise decode all image frames at once.
+
+```py
+pipe.enable_vae_slicing()
+```
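The slicing idea itself is simple; a rough sketch of what sliced decoding amounts to (the `decode_latents_sliced` helper is hypothetical, not the actual pipeline code, though `.sample` follows the `AutoencoderKL` decode output convention): decode the latents one frame at a time instead of as one batch, so the decoder's peak memory is bounded by a single frame:

```py
import torch


def decode_latents_sliced(vae, latents: torch.Tensor) -> torch.Tensor:
    # `latents` has shape (num_frames, channels, height, width); decoding one
    # frame per call keeps only a single frame's decoder activations in memory.
    frames = [vae.decode(latents[i : i + 1]).sample for i in range(latents.shape[0])]
    return torch.cat(frames, dim=0)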
 ## Available checkpoints

 * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/)
...
@@ -634,9 +634,6 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)

         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
...
@@ -709,9 +709,6 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 6.1 Chunk feed-forward computation to save memory
-        self.unet.enable_forward_chunking(chunk_size=1, dim=1)

         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
...
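With the automatic call removed from both pipelines' `__call__`, chunking is no longer forced on every run; callers who want the memory savings now opt in once on the pipeline object, as the updated docs above show. A minimal end-to-end sketch of the opt-in, assuming the `damo-vilab/text-to-video-ms-1.7b` checkpoint listed above:

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# Opt in to the optimizations that were previously applied automatically.
pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
pipe.enable_vae_slicing()

video_frames = pipe("Darth Vader surfing a wave", num_frames=24).frames
```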