"src/vscode:/vscode.git/clone" did not exist on "852dc76d6d84e4bf5f3e81701a68013c61714dd5"
Unverified Commit 4842f5d8 authored by sunxunle's avatar sunxunle Committed by GitHub
Browse files

chore: remove redundant words (#10609)


Signed-off-by: default avatarsunxunle <sunxunle@ampere.tech>
parent 328e0d20
...@@ -115,7 +115,7 @@ export_to_video(frames, "mochi.mp4", fps=30) ...@@ -115,7 +115,7 @@ export_to_video(frames, "mochi.mp4", fps=30)
## Reproducing the results from the Genmo Mochi repo ## Reproducing the results from the Genmo Mochi repo
The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the the original implementation, please refer to the following example. The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example.
<Tip> <Tip>
The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder. The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder.
......
...@@ -73,7 +73,7 @@ def _download(url: str, root: str): ...@@ -73,7 +73,7 @@ def _download(url: str, root: str):
loop.update(len(buffer)) loop.update(len(buffer))
if insecure_hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: if insecure_hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match") raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
return download_target return download_target
......
...@@ -258,7 +258,7 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -258,7 +258,7 @@ def get_polynomial_decay_schedule_with_warmup(
lr_init = optimizer.defaults["lr"] lr_init = optimizer.defaults["lr"]
if not (lr_init > lr_end): if not (lr_init > lr_end):
raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")
def lr_lambda(current_step: int): def lr_lambda(current_step: int):
if current_step < num_warmup_steps: if current_step < num_warmup_steps:
......
...@@ -158,7 +158,7 @@ class PAGMixin: ...@@ -158,7 +158,7 @@ class PAGMixin:
), ),
): ):
r""" r"""
Set the the self-attention layers to apply PAG. Raise ValueError if the input is invalid. Set the self-attention layers to apply PAG. Raise ValueError if the input is invalid.
Args: Args:
pag_applied_layers (`str` or `List[str]`): pag_applied_layers (`str` or `List[str]`):
......
...@@ -67,7 +67,7 @@ class VideoProcessor(VaeImageProcessor): ...@@ -67,7 +67,7 @@ class VideoProcessor(VaeImageProcessor):
# ensure the input is a list of videos: # ensure the input is a list of videos:
# - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray) # - if it is a batch of videos (5d torch.Tensor or np.ndarray), it is converted to a list of videos (a list of 4d torch.Tensor or np.ndarray)
# - if it is is a single video, it is converted to a list of one video. # - if it is a single video, it is converted to a list of one video.
if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5: if isinstance(video, (np.ndarray, torch.Tensor)) and video.ndim == 5:
video = list(video) video = list(video)
elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video): elif isinstance(video, list) and is_valid_image(video[0]) or is_valid_image_imagelist(video):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment