"vscode:/vscode.git/clone" did not exist on "c814abdadd31d7a92ba2b77c48b2f042318a5a7f"
Unverified commit ba5af5ae authored by Sayak Paul, committed by GitHub

[Cog] some minor fixes and nits (#9466)

* fix positional arguments in check_inputs().

* add video and latents to check_inputs().

* prep latents_in_channels.

* quality

* multiple fixes.

* fix
parent aa73072f
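For context, the pipelines touched by this commit are driven through the standard diffusers call interface. A minimal text-to-video sketch follows; the model ID, prompt, and generation settings are illustrative and not part of this PR.

import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Illustrative checkpoint; any CogVideoX text-to-video checkpoint should behave the same way.
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
pipe.to("cuda")

# num_frames follows the (num_seconds * fps + 1) convention described in the docstrings below.
video = pipe(
    prompt="A panda playing a guitar in a bamboo forest",  # illustrative prompt
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(video, "output.mp4", fps=8)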
src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py:

@@ -188,6 +188,9 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )

         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

@@ -317,6 +320,12 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     def prepare_latents(
         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         shape = (
             batch_size,
             (num_frames - 1) // self.vae_scale_factor_temporal + 1,

@@ -324,11 +333,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
             height // self.vae_scale_factor_spatial,
             width // self.vae_scale_factor_spatial,
         )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )

         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

@@ -341,7 +345,7 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents

         frames = self.vae.decode(latents).sample
         return frames

@@ -510,10 +514,10 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where

@@ -587,8 +591,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
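The additions above cache `vae.config.scaling_factor` on the pipeline as `self.vae_scaling_factor_image` (with a CogVideoX-style fallback of 0.7) so that `decode_latents` no longer reaches into `self.vae.config` at call time, and they move the generator-length check to the top of `prepare_latents` so a mismatched generator list fails before any work is done. A stripped-down, hypothetical illustration of the caching pattern (the class below is invented for this example and is not part of diffusers):

import torch

class TinyVideoPipeline:
    """Hypothetical minimal pipeline showing the config-caching pattern used above."""

    def __init__(self, vae=None):
        self.vae = vae
        # Cache VAE-derived constants once at construction; fall back to the
        # CogVideoX default of 0.7 when no VAE is attached.
        self.vae_scaling_factor_image = vae.config.scaling_factor if vae is not None else 0.7

    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
        # Uses the cached value instead of self.vae.config.scaling_factor.
        latents = 1 / self.vae_scaling_factor_image * latents
        return self.vae.decode(latents).sample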
src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py:

@@ -207,6 +207,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )

         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

@@ -348,6 +351,12 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         generator: Optional[torch.Generator] = None,
         latents: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         shape = (
             batch_size,

@@ -357,12 +366,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             width // self.vae_scale_factor_spatial,
         )

-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         image = image.unsqueeze(2)  # [B, C, F, H, W]

         if isinstance(generator, list):

@@ -373,7 +376,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]

         image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-        image_latents = self.vae.config.scaling_factor * image_latents
+        image_latents = self.vae_scaling_factor_image * image_latents

         padding_shape = (
             batch_size,

@@ -397,7 +400,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents

         frames = self.vae.decode(latents).sample
         return frames

@@ -438,7 +441,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         width,
         negative_prompt,
         callback_on_step_end_tensor_inputs,
-        video=None,
         latents=None,
         prompt_embeds=None,
         negative_prompt_embeds=None,

@@ -494,9 +496,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )

-        if video is not None and latents is not None:
-            raise ValueError("Only one of `video` or `latents` should be provided")
-
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
     def fuse_qkv_projections(self) -> None:
         r"""Enables fused QKV projections."""

@@ -584,7 +583,7 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         Args:
             image (`PipelineImageInput`):
-                The input video to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
+                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                 instead.

@@ -592,10 +591,10 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_frames (`int`, defaults to `48`):
                 Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                 contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where

@@ -665,20 +664,19 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            image,
-            prompt,
-            height,
-            width,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            image=image,
+            prompt=prompt,
+            height=height,
+            width=width,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
         self._interrupt = False
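Because the image-to-video pipeline now validates the generator list at the top of `prepare_latents` and calls `check_inputs` with keyword arguments (including `latents`), the optional inputs are checked together up front. A minimal image-to-video sketch, assuming a CogVideoX I2V checkpoint is available; the model ID, image URL, and prompt are illustrative.

import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Illustrative checkpoint ID.
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
).to("cuda")

image = load_image("https://example.com/astronaut.png")  # illustrative URL

# One generator per prompt in the batch; a mismatched list now raises early in prepare_latents().
generator = [torch.Generator(device="cuda").manual_seed(0)]

video = pipe(
    image=image,
    prompt="An astronaut waving at the camera",  # illustrative prompt
    num_frames=49,
    generator=generator,
).frames[0]

export_to_video(video, "i2v.mp4", fps=8)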
src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py:

@@ -204,12 +204,16 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )

         self.vae_scale_factor_spatial = (
             2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
         )
         self.vae_scale_factor_temporal = (
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
+        self.vae_scaling_factor_image = (
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+        )

         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

@@ -351,6 +355,12 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         latents: Optional[torch.Tensor] = None,
         timestep: Optional[torch.Tensor] = None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)

         shape = (

@@ -361,12 +371,6 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
             width // self.vae_scale_factor_spatial,
         )

-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
         if latents is None:
             if isinstance(generator, list):
                 if len(generator) != batch_size:

@@ -382,7 +386,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]

             init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
-            init_latents = self.vae.config.scaling_factor * init_latents
+            init_latents = self.vae_scaling_factor_image * init_latents

             noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
             latents = self.scheduler.add_noise(init_latents, noise, timestep)

@@ -396,7 +400,7 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
     def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
-        latents = 1 / self.vae.config.scaling_factor * latents
+        latents = 1 / self.vae_scaling_factor_image * latents

         frames = self.vae.decode(latents).sample
         return frames

@@ -589,10 +593,10 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The height in pixels of the generated image. This is set to 480 by default for the best results.
+            width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+                The width in pixels of the generated image. This is set to 720 by default for the best results.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.

@@ -658,20 +662,20 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

-        height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1

         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
-            prompt,
-            height,
-            width,
-            strength,
-            negative_prompt,
-            callback_on_step_end_tensor_inputs,
-            prompt_embeds,
-            negative_prompt_embeds,
+            prompt=prompt,
+            height=height,
+            width=width,
+            strength=strength,
+            negative_prompt=negative_prompt,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            video=video,
+            latents=latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
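As in the other two pipelines, `check_inputs` is now called with keyword arguments and additionally receives `video` and `latents`, so the pipeline can validate them up front. A minimal video-to-video sketch; the checkpoint ID, input clip, and prompt are illustrative.

import torch
from diffusers import CogVideoXVideoToVideoPipeline
from diffusers.utils import export_to_video, load_video

# Illustrative checkpoint ID.
pipe = CogVideoXVideoToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16
).to("cuda")

input_video = load_video("input.mp4")  # illustrative local clip

frames = pipe(
    video=input_video,
    prompt="The same scene repainted in a watercolor style",  # illustrative prompt
    strength=0.8,
    guidance_scale=6.0,
    generator=torch.Generator(device="cuda").manual_seed(42),
).frames[0]

export_to_video(frames, "v2v.mp4", fps=8)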