"tests/vscode:/vscode.git/clone" did not exist on "88aa7f6ebfcd2529b06b25e93a5340e4cc426a2e"
Unverified commit be4afa0b, authored by Mark Van Aken, committed by GitHub

#7535 Update FloatTensor type hints to Tensor (#7883)

* find & replace all FloatTensors to Tensor

* apply formatting

* Update torch.FloatTensor to torch.Tensor in the remaining files

* formatting

* Fix the rest of the places where FloatTensor is used as well as in documentation

* formatting

* Update new file from FloatTensor to Tensor
parent 04f4bd54
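
For context on why the annotation matters: `torch.FloatTensor` is the legacy type for 32-bit CPU tensors only, so annotating arguments with it wrongly excludes the fp16/bf16 and CUDA tensors these pipelines routinely handle, while `torch.Tensor` covers every dtype and device. A minimal sketch of the pattern this commit applies everywhere below (the function names are illustrative, not from the diff):

    import torch

    # Before: the legacy annotation describes only 32-bit CPU tensors.
    def scale_before(sample: torch.FloatTensor) -> torch.FloatTensor:
        return sample * 0.5

    # After: `torch.Tensor` is the general tensor type for any dtype/device.
    def scale_after(sample: torch.Tensor) -> torch.Tensor:
        return sample * 0.5

    half = torch.randn(2, 4, dtype=torch.float16)
    print(isinstance(half, torch.Tensor))       # True
    print(isinstance(half, torch.FloatTensor))  # False: fp16 is a HalfTensor
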
@@ -55,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput):
     The output of [`UNet3DConditionModel`].
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
             The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
     """
-    sample: torch.FloatTensor
+    sample: torch.Tensor
 class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
@@ -560,7 +560,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
         class_labels: Optional[torch.Tensor] = None,
@@ -570,15 +570,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
         mid_block_additional_residual: Optional[torch.Tensor] = None,
         return_dict: bool = True,
-    ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]:
         r"""
         The [`UNet3DConditionModel`] forward method.
         Args:
-            sample (`torch.FloatTensor`):
+            sample (`torch.Tensor`):
                 The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width)`.
-            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
-            encoder_hidden_states (`torch.FloatTensor`):
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
             class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                 Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
...
@@ -81,8 +81,8 @@ class I2VGenXLTransformerTemporalEncoder(nn.Module):
     def forward(
         self,
-        hidden_states: torch.FloatTensor,
-    ) -> torch.FloatTensor:
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
         norm_hidden_states = self.norm1(hidden_states)
         attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
         hidden_states = attn_output + hidden_states
@@ -514,7 +514,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         fps: torch.Tensor,
         image_latents: torch.Tensor,
@@ -523,19 +523,19 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         timestep_cond: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]:
+    ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]:
         r"""
         The [`I2VGenXLUNet`] forward method.
         Args:
-            sample (`torch.FloatTensor`):
+            sample (`torch.Tensor`):
                 The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
-            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
             fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition".
-            image_latents (`torch.FloatTensor`): Image encodings from the VAE.
-            image_embeddings (`torch.FloatTensor`):
+            image_latents (`torch.Tensor`): Image encodings from the VAE.
+            image_embeddings (`torch.Tensor`):
                 Projection embeddings of the conditioning image computed with a vision encoder.
-            encoder_hidden_states (`torch.FloatTensor`):
+            encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
...
@@ -31,7 +31,7 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 @dataclass
 class Kandinsky3UNetOutput(BaseOutput):
-    sample: torch.FloatTensor = None
+    sample: torch.Tensor = None
 class Kandinsky3EncoderProj(nn.Module):
...
@@ -786,7 +786,7 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
         timestep_cond: Optional[torch.Tensor] = None,
@@ -801,10 +801,10 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         The [`UNetMotionModel`] forward method.
         Args:
-            sample (`torch.FloatTensor`):
+            sample (`torch.Tensor`):
                 The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
-            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
-            encoder_hidden_states (`torch.FloatTensor`):
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
             timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
                 Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
...
@@ -22,11 +22,11 @@ class UNetSpatioTemporalConditionOutput(BaseOutput):
     The output of [`UNetSpatioTemporalConditionModel`].
     Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+        sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
             The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
     """
-    sample: torch.FloatTensor = None
+    sample: torch.Tensor = None
 class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
@@ -356,7 +356,7 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
     def forward(
         self,
-        sample: torch.FloatTensor,
+        sample: torch.Tensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
         added_time_ids: torch.Tensor,
@@ -366,12 +366,12 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
         The [`UNetSpatioTemporalConditionModel`] forward method.
         Args:
-            sample (`torch.FloatTensor`):
+            sample (`torch.Tensor`):
                 The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
-            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
-            encoder_hidden_states (`torch.FloatTensor`):
+            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+            encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
-            added_time_ids: (`torch.FloatTensor`):
+            added_time_ids: (`torch.Tensor`):
                 The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                 embeddings and added to the time embeddings.
             return_dict (`bool`, *optional*, defaults to `True`):
...
@@ -131,7 +131,7 @@ class UpDownBlock2d(nn.Module):
 @dataclass
 class StableCascadeUNetOutput(BaseOutput):
-    sample: torch.FloatTensor = None
+    sample: torch.Tensor = None
 class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin):
...
@@ -138,9 +138,7 @@ class Upsample2D(nn.Module):
         else:
             self.Conv2d_0 = conv
-    def forward(
-        self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs
-    ) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None, *args, **kwargs) -> torch.Tensor:
         if len(args) > 0 or kwargs.get("scale", None) is not None:
             deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
             deprecate("scale", "1.0.0", deprecation_message)
@@ -217,12 +215,12 @@ class FirUpsample2D(nn.Module):
     def _upsample_2d(
         self,
-        hidden_states: torch.FloatTensor,
-        weight: Optional[torch.FloatTensor] = None,
-        kernel: Optional[torch.FloatTensor] = None,
+        hidden_states: torch.Tensor,
+        weight: Optional[torch.Tensor] = None,
+        kernel: Optional[torch.Tensor] = None,
         factor: int = 2,
         gain: float = 1,
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """Fused `upsample_2d()` followed by `Conv2d()`.
         Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
@@ -230,19 +228,19 @@ class FirUpsample2D(nn.Module):
         arbitrary order.
         Args:
-            hidden_states (`torch.FloatTensor`):
+            hidden_states (`torch.Tensor`):
                 Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-            weight (`torch.FloatTensor`, *optional*):
+            weight (`torch.Tensor`, *optional*):
                 Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be
                 performed by `inChannels = x.shape[0] // numGroups`.
-            kernel (`torch.FloatTensor`, *optional*):
+            kernel (`torch.Tensor`, *optional*):
                 FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
                 corresponds to nearest-neighbor upsampling.
             factor (`int`, *optional*): Integer upsampling factor (default: 2).
             gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0).
         Returns:
-            output (`torch.FloatTensor`):
+            output (`torch.Tensor`):
                 Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same
                 datatype as `hidden_states`.
         """
@@ -310,7 +308,7 @@ class FirUpsample2D(nn.Module):
         return output
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if self.use_conv:
             height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel)
             height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1)
@@ -401,11 +399,11 @@ def upfirdn2d_native(
 def upsample_2d(
-    hidden_states: torch.FloatTensor,
-    kernel: Optional[torch.FloatTensor] = None,
+    hidden_states: torch.Tensor,
+    kernel: Optional[torch.Tensor] = None,
     factor: int = 2,
     gain: float = 1,
-) -> torch.FloatTensor:
+) -> torch.Tensor:
     r"""Upsample2D a batch of 2D images with the given filter.
     Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
     filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
@@ -413,9 +411,9 @@ def upsample_2d(
     a multiple of the upsampling factor.
     Args:
-        hidden_states (`torch.FloatTensor`):
+        hidden_states (`torch.Tensor`):
            Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
-        kernel (`torch.FloatTensor`, *optional*):
+        kernel (`torch.Tensor`, *optional*):
            FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which
            corresponds to nearest-neighbor upsampling.
        factor (`int`, *optional*, default to `2`):
@@ -424,7 +422,7 @@ def upsample_2d(
            Scaling factor for signal magnitude (default: 1.0).
     Returns:
-        output (`torch.FloatTensor`):
+        output (`torch.Tensor`):
            Tensor of the shape `[N, C, H * factor, W * factor]`
     """
     assert isinstance(factor, int) and factor >= 1
...
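
The `upsample_2d` docstring above states that the default `[1] * factor` FIR kernel corresponds to nearest-neighbor upsampling. A small self-contained sketch (not the library's `upfirdn2d` implementation) that makes this concrete under those defaults:

    import torch
    import torch.nn.functional as F

    def naive_fir_upsample(x: torch.Tensor, factor: int = 2, gain: float = 1.0) -> torch.Tensor:
        # Separable FIR kernel: the default [1] * factor, normalized and then
        # scaled by gain * factor**2 as the docstring describes.
        k = torch.ones(factor, dtype=x.dtype)
        kernel = torch.outer(k, k)
        kernel = kernel / kernel.sum() * gain * factor**2

        n, c, h, w = x.shape
        # Zero-stuff: one input pixel per factor x factor output block.
        up = torch.zeros(n, c, h * factor, w * factor, dtype=x.dtype)
        up[:, :, ::factor, ::factor] = x
        # Depthwise convolution with the FIR filter.
        weight = kernel.expand(c, 1, factor, factor).contiguous()
        out = F.conv2d(up, weight, padding=factor - 1, groups=c)
        return out[:, :, : h * factor, : w * factor]

    x = torch.randn(1, 3, 4, 4)
    assert torch.allclose(naive_fir_upsample(x), F.interpolate(x, scale_factor=2, mode="nearest"))
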
@@ -30,11 +30,11 @@ class VQEncoderOutput(BaseOutput):
     Output of VQModel encoding method.
     Args:
-        latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+        latents (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
             The encoded output sample from the last layer of the model.
     """
-    latents: torch.FloatTensor
+    latents: torch.Tensor
 class VQModel(ModelMixin, ConfigMixin):
@@ -127,7 +127,7 @@ class VQModel(ModelMixin, ConfigMixin):
     )
     @apply_forward_hook
-    def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput:
+    def encode(self, x: torch.Tensor, return_dict: bool = True) -> VQEncoderOutput:
         h = self.encoder(x)
         h = self.quant_conv(h)
@@ -138,8 +138,8 @@ class VQModel(ModelMixin, ConfigMixin):
     @apply_forward_hook
     def decode(
-        self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
-    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        self, h: torch.Tensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None
+    ) -> Union[DecoderOutput, torch.Tensor]:
         # also go through quantization layer
         if not force_not_quantize:
             quant, _, _ = self.quantize(h)
@@ -156,13 +156,13 @@ class VQModel(ModelMixin, ConfigMixin):
         return DecoderOutput(sample=dec)
     def forward(
-        self, sample: torch.FloatTensor, return_dict: bool = True
-    ) -> Union[DecoderOutput, Tuple[torch.FloatTensor, ...]]:
+        self, sample: torch.Tensor, return_dict: bool = True
+    ) -> Union[DecoderOutput, Tuple[torch.Tensor, ...]]:
         r"""
         The [`VQModel`] forward method.
         Args:
-            sample (`torch.FloatTensor`): Input sample.
+            sample (`torch.Tensor`): Input sample.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple.
...
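
A hedged usage sketch of the `encode`/`decode` API annotated in this hunk, using a tiny randomly initialized model; real checkpoints would be loaded with `VQModel.from_pretrained(...)` and the shapes depend on the config:

    import torch
    from diffusers import VQModel

    model = VQModel()  # default config, randomly initialized
    x = torch.randn(1, 3, 32, 32)
    latents = model.encode(x).latents     # VQEncoderOutput.latents: torch.Tensor
    recon = model.decode(latents).sample  # DecoderOutput.sample: torch.Tensor
    print(recon.shape)
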
@@ -88,7 +88,7 @@ class AmusedPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -122,16 +122,16 @@ class AmusedPipeline(DiffusionPipeline):
             latents (`torch.IntTensor`, *optional*):
                 Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image
                 generation. If not provided, the starting latents will be completely masked.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -140,7 +140,7 @@ class AmusedPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
...
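
To make the documented callback contract concrete, a small sketch; the pipeline variable and its construction are assumptions, not part of the diff:

    import torch

    def log_progress(step: int, timestep: int, latents: torch.Tensor) -> None:
        # Matches the documented signature; invoked every `callback_steps` steps.
        print(f"step={step} timestep={timestep} latents shape={tuple(latents.shape)}")

    # Hypothetical usage with an already-constructed AmusedPipeline `pipe`:
    # image = pipe("a photo of a cat", callback=log_progress, callback_steps=5).images[0]
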
@@ -102,7 +102,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -115,7 +115,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                 numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list
                 of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
@@ -141,16 +141,16 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -159,7 +159,7 @@ class AmusedImg2ImgPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
...
@@ -119,7 +119,7 @@ class AmusedInpaintPipeline(DiffusionPipeline):
         negative_encoder_hidden_states: Optional[torch.Tensor] = None,
         output_type="pil",
         return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
         callback_steps: int = 1,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
         micro_conditioning_aesthetic_score: int = 6,
@@ -132,13 +132,13 @@ class AmusedInpaintPipeline(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                 numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list
                 of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
                 list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
                 latents as `image`, but if passing latents directly it is not encoded again.
-            mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+            mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                 `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
                 are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
                 single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
@@ -165,16 +165,16 @@ class AmusedInpaintPipeline(DiffusionPipeline):
             generator (`torch.Generator`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument. A single vector from the
                 pooled and projected final hidden states.
-            encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            encoder_hidden_states (`torch.Tensor`, *optional*):
                 Pre-generated penultimate hidden states from the text encoder providing additional text conditioning.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            negative_encoder_hidden_states (`torch.FloatTensor`, *optional*):
+            negative_encoder_hidden_states (`torch.Tensor`, *optional*):
                 Analogous to `encoder_hidden_states` for the positive prompt.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
@@ -183,7 +183,7 @@ class AmusedInpaintPipeline(DiffusionPipeline):
                 plain tuple.
             callback (`Callable`, *optional*):
                 A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function is called. If not specified, the callback is called at
                 every step.
...
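
A short sketch of building a `mask_image` that satisfies the convention documented above (white pixels repainted, black pixels preserved, single channel); the 512x512 size and the masked region are arbitrary:

    import numpy as np
    from PIL import Image

    mask = np.zeros((512, 512), dtype=np.uint8)
    mask[128:384, 128:384] = 255  # white square: this region will be repainted
    mask_image = Image.fromarray(mask, mode="L")  # single-channel (luminance)
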
@@ -148,8 +148,8 @@ class AnimateDiffPipeline(
         num_images_per_prompt,
         do_classifier_free_guidance,
         negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -169,10 +169,10 @@ class AnimateDiffPipeline(
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
@@ -563,11 +563,11 @@ class AnimateDiffPipeline(
         num_videos_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -604,27 +604,26 @@ class AnimateDiffPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                 generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                 `(batch_size, num_channel, num_frames, height, width)`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                 provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                 Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                 IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                 contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                 provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
-                `np.array`.
+                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                 of a plain tuple.
...
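
A shape-only sketch of the `ip_adapter_image_embeds` contract described above. All sizes are made up, and the exact layout of the negative/positive concatenation for classifier-free guidance is an assumption drawn from the docstring, not from this diff:

    import torch

    batch_size, num_images, emb_dim = 1, 1, 1024  # emb_dim depends on the IP-Adapter
    positive = torch.randn(batch_size, num_images, emb_dim)
    negative = torch.zeros_like(positive)  # placeholder negative embedding for CFG
    # One list entry per loaded IP-Adapter; the negative embedding is included
    # when `do_classifier_free_guidance` is True.
    ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]
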
@@ -312,10 +312,10 @@ class AnimateDiffSDXLPipeline(
         do_classifier_free_guidance: bool = True,
         negative_prompt: Optional[str] = None,
         negative_prompt_2: Optional[str] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         lora_scale: Optional[float] = None,
         clip_skip: Optional[int] = None,
     ):
@@ -341,17 +341,17 @@ class AnimateDiffSDXLPipeline(
             negative_prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                 `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
@@ -784,7 +784,7 @@ class AnimateDiffSDXLPipeline(
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
         self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
@@ -797,7 +797,7 @@ class AnimateDiffSDXLPipeline(
                 Data type of the generated embeddings.
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
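
For reference, a sketch of the sinusoidal guidance-scale embedding the hunk above annotates, reconstructed from the docstring and the linked `model_vdm.py`; treat it as illustrative rather than the exact library code:

    import torch

    def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
        # Sinusoidal features of the (scaled) guidance weight w -> (len(w), embedding_dim).
        assert w.ndim == 1
        w = w * 1000.0
        half_dim = embedding_dim // 2
        freqs = torch.exp(-torch.arange(half_dim) * (torch.log(torch.tensor(10000.0)) / (half_dim - 1)))
        angles = w.float()[:, None] * freqs[None, :]
        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        if embedding_dim % 2 == 1:  # zero-pad odd dimensions
            emb = torch.nn.functional.pad(emb, (0, 1))
        return emb

    print(guidance_scale_embedding(torch.tensor([7.5]), 512).shape)  # torch.Size([1, 512])
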
@@ -866,13 +866,13 @@ class AnimateDiffSDXLPipeline(
         num_videos_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -949,27 +949,27 @@ class AnimateDiffSDXLPipeline(
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                 to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
+            latents (`torch.Tensor`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                 tensor will be generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
+            prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                 provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                 If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                 Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
                 `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
...
...@@ -252,8 +252,8 @@ class AnimateDiffVideoToVideoPipeline( ...@@ -252,8 +252,8 @@ class AnimateDiffVideoToVideoPipeline(
num_images_per_prompt, num_images_per_prompt,
do_classifier_free_guidance, do_classifier_free_guidance,
negative_prompt=None, negative_prompt=None,
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None, clip_skip: Optional[int] = None,
): ):
@@ -273,10 +273,10 @@ class AnimateDiffVideoToVideoPipeline(
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
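The `prompt_embeds`/`negative_prompt_embeds` arguments documented above follow a common diffusers pattern: skip the string prompts and pass embeddings directly. A minimal sketch with illustrative shapes (a real call would produce them with the pipeline's own text encoder):

```python
import torch

# Illustrative shapes only; real embeddings come from the pipeline's text encoder.
batch, seq_len, dim = 1, 77, 768
prompt_embeds = torch.randn(batch, seq_len, dim)           # stands in for the encoded prompt
negative_prompt_embeds = torch.zeros(batch, seq_len, dim)  # stands in for the encoded negative prompt

# Passed instead of `prompt` / `negative_prompt`, e.g.:
# output = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds)
```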
@@ -741,11 +741,11 @@ class AnimateDiffVideoToVideoPipeline(
num_videos_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-latents: Optional[torch.FloatTensor] = None,
-prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+latents: Optional[torch.Tensor] = None,
+prompt_embeds: Optional[torch.Tensor] = None,
+negative_prompt_embeds: Optional[torch.Tensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
-ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
+ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -790,27 +790,26 @@ class AnimateDiffVideoToVideoPipeline(
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
-latents (`torch.FloatTensor`, *optional*):
+latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
`(batch_size, num_channel, num_frames, height, width)`.
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
-ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list with length equal to the number of
IP-Adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
-The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
-`np.array`.
+The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
cross_attention_kwargs (`dict`, *optional*):
...
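The `latents` and `generator` arguments above combine naturally for reproducible sampling; a sketch assuming the 5D latent shape the docstring gives (concrete sizes are illustrative):

```python
import torch

generator = torch.Generator("cpu").manual_seed(0)
# (batch_size, num_channels, num_frames, height, width); sizes assumed for illustration
latents = torch.randn((1, 4, 16, 64, 64), generator=generator)

# Reusing the same latents pins the noise while the prompt varies:
# video_a = pipe(prompt="a river at dawn", latents=latents).frames
# video_b = pipe(prompt="a glacier at dusk", latents=latents).frames
```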
@@ -103,8 +103,8 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
num_waveforms_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
-prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+prompt_embeds: Optional[torch.Tensor] = None,
+negative_prompt_embeds: Optional[torch.Tensor] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
@@ -122,10 +122,10 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
The prompt or prompts not to guide the audio generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
@@ -360,11 +360,11 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
num_waveforms_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-latents: Optional[torch.FloatTensor] = None,
-prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+latents: Optional[torch.Tensor] = None,
+prompt_embeds: Optional[torch.Tensor] = None,
+negative_prompt_embeds: Optional[torch.Tensor] = None,
return_dict: bool = True,
-callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: Optional[int] = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
output_type: Optional[str] = "np",
@@ -394,21 +394,21 @@ class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
-latents (`torch.FloatTensor`, *optional*):
+latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
callback (`Callable`, *optional*):
A function that is called every `callback_steps` steps during inference. The function is called with the
-following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
...
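The `callback`/`callback_steps` pair documented above uses the signature `callback(step: int, timestep: int, latents: torch.Tensor)`; a small sketch of a conforming callback (the commented pipeline call is indicative only):

```python
import torch

def log_progress(step: int, timestep: int, latents: torch.Tensor) -> None:
    # Invoked every `callback_steps` denoising steps with the current latents.
    print(f"step={step} timestep={timestep} latent_norm={latents.norm().item():.3f}")

# audio = pipe(prompt="techno beat", callback=log_progress, callback_steps=5).audios
```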
@@ -64,7 +64,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput):
"""
Class for AudioLDM2 projection layer's outputs.

Args:
-hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text
encoders and subsequently concatenating them together.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -75,7 +75,7 @@ class AudioLDM2ProjectionModelOutput(BaseOutput):
- 0 for tokens that are **masked**.
"""

-hidden_states: torch.FloatTensor
+hidden_states: torch.Tensor
attention_mask: Optional[torch.LongTensor] = None
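As a rough picture of what `hidden_states` in this output holds: each text encoder's states are projected to a shared width and concatenated along the sequence axis. A sketch with assumed dimensions (the real projection layers live inside the model):

```python
import torch
import torch.nn as nn

clap_states = torch.randn(2, 1, 512)   # pooled CLAP-style embedding (assumed size)
t5_states = torch.randn(2, 77, 1024)   # Flan-T5-style sequence (assumed size)

project_clap = nn.Linear(512, 768)     # stand-ins for the model's projection layers
project_t5 = nn.Linear(1024, 768)

hidden_states = torch.cat([project_clap(clap_states), project_t5(t5_states)], dim=1)
print(hidden_states.shape)  # torch.Size([2, 78, 768])
```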
@@ -125,8 +125,8 @@ class AudioLDM2ProjectionModel(ModelMixin, ConfigMixin):
def forward(
self,
-hidden_states: Optional[torch.FloatTensor] = None,
-hidden_states_1: Optional[torch.FloatTensor] = None,
+hidden_states: Optional[torch.Tensor] = None,
+hidden_states_1: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
attention_mask_1: Optional[torch.LongTensor] = None,
):
@@ -680,7 +680,7 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
def forward(
self,
-sample: torch.FloatTensor,
+sample: torch.Tensor,
timestep: Union[torch.Tensor, float, int],
encoder_hidden_states: torch.Tensor,
class_labels: Optional[torch.Tensor] = None,
@@ -696,10 +696,10 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
The [`AudioLDM2UNet2DConditionModel`] forward method.

Args:
-sample (`torch.FloatTensor`):
+sample (`torch.Tensor`):
The noisy input tensor with the following shape `(batch, channel, height, width)`.
-timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
-encoder_hidden_states (`torch.FloatTensor`):
+timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+encoder_hidden_states (`torch.Tensor`):
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
encoder_attention_mask (`torch.Tensor`):
A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
@@ -710,7 +710,7 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
tuple.
cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
-encoder_hidden_states_1 (`torch.FloatTensor`, *optional*):
+encoder_hidden_states_1 (`torch.Tensor`, *optional*):
A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be
used to condition the model on a different set of embeddings to `encoder_hidden_states`.
encoder_attention_mask_1 (`torch.Tensor`, *optional*):
@@ -1091,14 +1091,14 @@ class CrossAttnDownBlock2D(nn.Module):
def forward(
self,
-hidden_states: torch.FloatTensor,
-temb: Optional[torch.FloatTensor] = None,
-encoder_hidden_states: Optional[torch.FloatTensor] = None,
-attention_mask: Optional[torch.FloatTensor] = None,
+hidden_states: torch.Tensor,
+temb: Optional[torch.Tensor] = None,
+encoder_hidden_states: Optional[torch.Tensor] = None,
+attention_mask: Optional[torch.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-encoder_attention_mask: Optional[torch.FloatTensor] = None,
-encoder_hidden_states_1: Optional[torch.FloatTensor] = None,
-encoder_attention_mask_1: Optional[torch.FloatTensor] = None,
+encoder_attention_mask: Optional[torch.Tensor] = None,
+encoder_hidden_states_1: Optional[torch.Tensor] = None,
+encoder_attention_mask_1: Optional[torch.Tensor] = None,
):
output_states = ()
num_layers = len(self.resnets)
@@ -1270,15 +1270,15 @@ class UNetMidBlock2DCrossAttn(nn.Module):
def forward(
self,
-hidden_states: torch.FloatTensor,
-temb: Optional[torch.FloatTensor] = None,
-encoder_hidden_states: Optional[torch.FloatTensor] = None,
-attention_mask: Optional[torch.FloatTensor] = None,
+hidden_states: torch.Tensor,
+temb: Optional[torch.Tensor] = None,
+encoder_hidden_states: Optional[torch.Tensor] = None,
+attention_mask: Optional[torch.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-encoder_attention_mask: Optional[torch.FloatTensor] = None,
-encoder_hidden_states_1: Optional[torch.FloatTensor] = None,
-encoder_attention_mask_1: Optional[torch.FloatTensor] = None,
-) -> torch.FloatTensor:
+encoder_attention_mask: Optional[torch.Tensor] = None,
+encoder_hidden_states_1: Optional[torch.Tensor] = None,
+encoder_attention_mask_1: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
hidden_states = self.resnets[0](hidden_states, temb)
num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1)
@@ -1437,16 +1437,16 @@ class CrossAttnUpBlock2D(nn.Module):
def forward(
self,
-hidden_states: torch.FloatTensor,
-res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
-temb: Optional[torch.FloatTensor] = None,
-encoder_hidden_states: Optional[torch.FloatTensor] = None,
+hidden_states: torch.Tensor,
+res_hidden_states_tuple: Tuple[torch.Tensor, ...],
+temb: Optional[torch.Tensor] = None,
+encoder_hidden_states: Optional[torch.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
upsample_size: Optional[int] = None,
-attention_mask: Optional[torch.FloatTensor] = None,
-encoder_attention_mask: Optional[torch.FloatTensor] = None,
-encoder_hidden_states_1: Optional[torch.FloatTensor] = None,
-encoder_attention_mask_1: Optional[torch.FloatTensor] = None,
+attention_mask: Optional[torch.Tensor] = None,
+encoder_attention_mask: Optional[torch.Tensor] = None,
+encoder_hidden_states_1: Optional[torch.Tensor] = None,
+encoder_attention_mask_1: Optional[torch.Tensor] = None,
):
num_layers = len(self.resnets)
num_attention_per_layer = len(self.attentions) // num_layers
...
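`res_hidden_states_tuple` in `CrossAttnUpBlock2D.forward` above carries the down-path residuals; each resnet in the up block pops one and concatenates it on the channel axis before running. A shape-level sketch with assumed sizes:

```python
import torch

hidden_states = torch.randn(1, 640, 16, 16)  # current up-path activations (assumed sizes)
res_hidden_states_tuple = tuple(torch.randn(1, 320, 16, 16) for _ in range(3))

# One iteration of the pattern: pop the most recent residual, then concatenate channels.
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
print(hidden_states.shape)  # torch.Size([1, 960, 16, 16]), fed to the next resnet
```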
@@ -273,7 +273,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.

Parameters:
-inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
The sequence used as a prompt for the generation.
max_new_tokens (`int`):
Number of new tokens to generate.
@@ -282,7 +282,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
function of the model.

Returns:
-inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
The sequence of generated hidden-states.
"""
max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
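The docstring above describes autoregressive generation over embeddings rather than token ids; a plausible sketch of that loop, assuming a GPT2-style `language_model` that accepts `inputs_embeds` (not the exact pipeline code):

```python
import torch

def generate_hidden_states(language_model, inputs_embeds: torch.Tensor,
                           max_new_tokens: int = 8) -> torch.Tensor:
    for _ in range(max_new_tokens):
        out = language_model(inputs_embeds=inputs_embeds, output_hidden_states=True)
        next_state = out.hidden_states[-1][:, -1:]  # last layer, last position
        inputs_embeds = torch.cat([inputs_embeds, next_state], dim=1)
    return inputs_embeds[:, -max_new_tokens:]       # only the generated states
```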
@@ -311,10 +311,10 @@ class AudioLDM2Pipeline(DiffusionPipeline):
do_classifier_free_guidance,
transcription=None,
negative_prompt=None,
-prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-generated_prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None,
+prompt_embeds: Optional[torch.Tensor] = None,
+negative_prompt_embeds: Optional[torch.Tensor] = None,
+generated_prompt_embeds: Optional[torch.Tensor] = None,
+negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
negative_attention_mask: Optional[torch.LongTensor] = None,
max_new_tokens: Optional[int] = None,
@@ -337,18 +337,18 @@ class AudioLDM2Pipeline(DiffusionPipeline):
The prompt or prompts not to guide the audio generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.*
prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
`negative_prompt` input argument.
-generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument.
-negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
`negative_prompt` input argument.
@@ -361,11 +361,11 @@ class AudioLDM2Pipeline(DiffusionPipeline):
max_new_tokens (`int`, *optional*, defaults to None):
The number of new tokens to generate with the GPT2 language model.
Returns:
-prompt_embeds (`torch.FloatTensor`):
+prompt_embeds (`torch.Tensor`):
Text embeddings from the Flan T5 model.
attention_mask (`torch.LongTensor`):
Attention mask to be applied to the `prompt_embeds`.
-generated_prompt_embeds (`torch.FloatTensor`):
+generated_prompt_embeds (`torch.Tensor`):
Text embeddings generated from the GPT2 language model.
Example:
@@ -821,16 +821,16 @@ class AudioLDM2Pipeline(DiffusionPipeline):
num_waveforms_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-latents: Optional[torch.FloatTensor] = None,
-prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-generated_prompt_embeds: Optional[torch.FloatTensor] = None,
-negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None,
+latents: Optional[torch.Tensor] = None,
+prompt_embeds: Optional[torch.Tensor] = None,
+negative_prompt_embeds: Optional[torch.Tensor] = None,
+generated_prompt_embeds: Optional[torch.Tensor] = None,
+negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
negative_attention_mask: Optional[torch.LongTensor] = None,
max_new_tokens: Optional[int] = None,
return_dict: bool = True,
-callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: Optional[int] = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
output_type: Optional[str] = "np",
@@ -865,21 +865,21 @@ class AudioLDM2Pipeline(DiffusionPipeline):
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
-latents (`torch.FloatTensor`, *optional*):
+latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
-prompt_embeds (`torch.FloatTensor`, *optional*):
+prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
-negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
*e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
argument.
-negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*):
+negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
`negative_prompt` input argument.
@@ -897,7 +897,7 @@ class AudioLDM2Pipeline(DiffusionPipeline):
plain tuple.
callback (`Callable`, *optional*):
A function that is called every `callback_steps` steps during inference. The function is called with the
-following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function is called. If not specified, the callback is called at
every step.
...
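Taken together, the signature above supports the usual text-to-audio call; a hedged usage sketch (the checkpoint id is illustrative and a CUDA device is assumed):

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

audio = pipe(
    prompt="a gentle piano melody",
    negative_prompt="low quality, distorted",
    num_inference_steps=100,
    num_waveforms_per_prompt=2,  # extra candidates the pipeline can rank against the prompt
    generator=torch.Generator("cuda").manual_seed(0),
).audios[0]
```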
@@ -298,7 +298,7 @@ class BlipImageProcessor(BaseImageProcessor):
return encoded_outputs

# Follows diffusers.VaeImageProcessor.postprocess
-def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"):
+def postprocess(self, sample: torch.Tensor, output_type: str = "pil"):
if output_type not in ["pt", "np", "pil"]:
raise ValueError(
f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
...
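For reference, `postprocess` mirrors `diffusers.VaeImageProcessor.postprocess`; the body below is a plausible reconstruction of the elided dispatch, not the exact source:

```python
import numpy as np
import torch
from PIL import Image

def postprocess(sample: torch.Tensor, output_type: str = "pil"):
    if output_type not in ["pt", "np", "pil"]:
        raise ValueError(
            f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
        )
    sample = (sample / 2 + 0.5).clamp(0, 1)  # map [-1, 1] model output to [0, 1]
    if output_type == "pt":
        return sample
    sample = sample.cpu().permute(0, 2, 3, 1).float().numpy()  # NCHW -> NHWC
    if output_type == "np":
        return sample
    return [Image.fromarray((img * 255).round().astype(np.uint8)) for img in sample]
```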
@@ -117,7 +117,7 @@ class Blip2VisionEmbeddings(nn.Module):
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

-def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
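A shape walk-through of the patch-embedding line above, with sizes assumed from a BLIP-2-style ViT (224-pixel images, 14-pixel patches, embed_dim 1408):

```python
import torch
import torch.nn as nn

pixel_values = torch.randn(1, 3, 224, 224)
patch_embedding = nn.Conv2d(3, 1408, kernel_size=14, stride=14)  # stand-in for the module's layer

patch_embeds = patch_embedding(pixel_values)      # (1, 1408, 16, 16) = [*, width, grid, grid]
tokens = patch_embeds.flatten(2).transpose(1, 2)  # (1, 256, 1408): one token per patch
```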
@@ -376,7 +376,7 @@ class Blip2VisionModel(Blip2PreTrainedModel):
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
def forward(
self,
-pixel_values: Optional[torch.FloatTensor] = None,
+pixel_values: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@@ -524,15 +524,15 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
return_dict=None,
):
r"""
-encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
+encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
-encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`):
+encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
-past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of:
+past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
...
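The `past_key_values` layout described above (per layer, four cached tensors: key/value for self-attention plus key/value for cross-attention) can be pictured with assumed sizes:

```python
import torch

batch, num_heads, past_len, head_dim = 1, 12, 9, 64
# Four tensors per layer, as the docstring describes; sizes are illustrative only.
layer_past = tuple(torch.randn(batch, num_heads, past_len, head_dim) for _ in range(4))
past_key_values = tuple(layer_past for _ in range(12))  # one entry per layer (config.n_layers)
```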
@@ -186,7 +186,7 @@ class ContextCLIPTextEmbeddings(nn.Module):
ctx_begin_pos: list,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
-inputs_embeds: Optional[torch.FloatTensor] = None,
+inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if ctx_embeddings is None:
ctx_len = 0
...
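To picture what `ContextCLIPTextEmbeddings` does with `ctx_begin_pos`: learned context embeddings are spliced into the token-embedding sequence at that position. A sketch with illustrative sizes (the real module also handles the `ctx_embeddings is None` branch shown above):

```python
import torch

token_embeds = torch.randn(1, 77, 768)    # CLIP text embeddings (illustrative)
ctx_embeddings = torch.randn(1, 16, 768)  # learned subject/context vectors (illustrative)
ctx_begin_pos = 1                         # splice right after the BOS token

spliced = torch.cat(
    [token_embeds[:, :ctx_begin_pos], ctx_embeddings, token_embeds[:, ctx_begin_pos:]],
    dim=1,
)
print(spliced.shape)  # torch.Size([1, 93, 768])
```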