Commit 08a21d59 authored by chenpangpang

feat: initial commit

parent 1a6b26f1
Pipeline #2165 failed with stages in 0 seconds
{
"last_node_id": 6,
"last_link_id": 3,
"nodes": [
{
"id": 1,
"type": "Ruyi_LoadModel",
"pos": {
"0": 210,
"1": 162
},
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "ruyi_model",
"type": "RUYI_MODEL",
"links": [
1
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "Ruyi_LoadModel"
},
"widgets_values": [
"Ruyi-Mini-7B",
"yes",
"yes"
]
},
{
"id": 4,
"type": "VHS_VideoCombine",
"pos": {
"0": 1045,
"1": 133
},
"size": [
404.73553466796875,
601.8645528157551
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 3
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 24,
"loop_count": 0,
"filename_prefix": "Ruyi-I2V-StartFrame",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": true,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "Ruyi-I2V-StartFrame_00001.mp4",
"subfolder": "",
"type": "output",
"format": "video/h264-mp4",
"frame_rate": 24
},
"muted": false
}
}
},
{
"id": 3,
"type": "LoadImage",
"pos": {
"0": 200,
"1": 439
},
"size": {
"0": 315,
"1": 314
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
2
],
"slot_index": 0
},
{
"name": "MASK",
"type": "MASK",
"links": null
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"example_03.jpg",
"image"
]
},
{
"id": 2,
"type": "Ruyi_I2VSampler",
"pos": {
"0": 628,
"1": 284
},
"size": {
"0": 327.5999755859375,
"1": 338
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "ruyi_model",
"type": "RUYI_MODEL",
"link": 1
},
{
"name": "start_img",
"type": "IMAGE",
"link": 2
},
{
"name": "end_img",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
3
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "Ruyi_I2VSampler"
},
"widgets_values": [
120,
512,
925247271358454,
"randomize",
25,
7,
"DDIM",
"2",
"static",
"normal_mode",
"5"
]
}
],
"links": [
[
1,
1,
0,
2,
0,
"RUYI_MODEL"
],
[
2,
3,
0,
2,
1,
"IMAGE"
],
[
3,
2,
0,
4,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 1,
"offset": [
0,
0
]
}
},
"version": 0.4
}
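The workflow above wires Ruyi_LoadModel and LoadImage into Ruyi_I2VSampler, whose image output feeds VHS_VideoCombine. Below is a minimal sketch for inspecting such a workflow file outside ComfyUI; the file name workflow_ruyi_i2v.json is assumed for illustration, and each entry in "links" is [link_id, source_node, source_slot, target_node, target_slot, type].

import json

# Hypothetical path to the exported workflow shown above.
with open("workflow_ruyi_i2v.json") as f:
    workflow = json.load(f)

# Map node ids to node types for readable output.
node_types = {node["id"]: node["type"] for node in workflow["nodes"]}

# Print each connection: source node/slot -> target node/slot (link type).
for link_id, src, src_slot, dst, dst_slot, link_type in workflow["links"]:
    print(f"link {link_id}: {node_types[src]}[{src_slot}] -> {node_types[dst]}[{dst_slot}] ({link_type})")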
transformer_additional_kwargs:
basic_block_type: "basic"
after_norm: false
time_position_encoding: true
noise_scheduler_kwargs:
beta_start: 0.00085
beta_end: 0.03
beta_schedule: "scaled_linear"
steps_offset: 1
prediction_type: "v_prediction"
clip_sample: false
vae_kwargs:
enable_magvit: true
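# Note: in the example scripts below, this config is loaded with OmegaConf.load(), and
# transformer_additional_kwargs is converted via OmegaConf.to_container() before being
# passed to HunyuanTransformer3DModel.from_pretrained_2d().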
import os
import torch
from PIL import Image
from diffusers import (EulerDiscreteScheduler, EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler, PNDMScheduler, DDIMScheduler)
from omegaconf import OmegaConf
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from safetensors.torch import load_file as load_safetensors
from huggingface_hub import snapshot_download
from ruyi.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
from ruyi.models.autoencoder_magvit import AutoencoderKLMagvit
from ruyi.models.transformer3d import HunyuanTransformer3DModel
from ruyi.pipeline.pipeline_ruyi_inpaint import RuyiInpaintPipeline
from ruyi.utils.lora_utils import merge_lora, unmerge_lora
from ruyi.utils.utils import get_image_to_video_latent, save_videos_grid
# Input and output
start_image_path = "assets/girl_01.jpg"
end_image_path = "assets/girl_02.jpg" # Can be None for start-image-to-video
output_video_path = "outputs/example_01.mp4"
# Video settings
video_length = 120 # The max video length is 120 frames (24 frames per second)
base_resolution = 512 # The generated video has approximately 512 x 512 pixels in total; values in the range [384, 896] typically produce good video quality.
video_size = None # Override base_resolution. Format: [height, width], e.g., [384, 672]
# Control settings
aspect_ratio = "16:9" # Choose in ["16:9", "9:16"], note that this is only the hint
motion = "auto" # Motion control, choose in ["1", "2", "3", "4", "auto"]
camera_direction = "auto" # Camera control, choose in ["static", "left", "right", "up", "down", "auto"]
# Sampler settings
steps = 25
cfg = 7.0
scheduler_name = "DDIM" # Choose in ["Euler", "Euler A", "DPM++", "PNDM","DDIM"]
# GPU memory settings
low_gpu_memory_mode = False # If True, use sequential CPU offload to reduce GPU memory usage
gpu_offload_steps = 5 # Choose from [0, 10, 7, 5, 1]; values later in the list use less GPU memory but take longer
# Random seed
seed = 42 # The Answer to the Ultimate Question of Life, The Universe, and Everything
# Model settings
config_path = "config/default.yaml"
model_name = "Ruyi-Mini-7B"
model_type = "Inpaint"
model_path = f"models/{model_name}" # (Down)load mode in this path
auto_download = True # Automatically download the model if the pipeline creation fails
auto_update = True # If auto_download is enabled, check for updates and update the model if necessary
# LoRA settings
lora_path = None
lora_weight = 1.0
# Other settings
weight_dtype = torch.bfloat16
device = torch.device("cuda")
def get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction):
# Default keys
p_default_key = "p.default"
n_default_key = "n.default"
# Load embeddings
if motion == "auto":
motion = "0"
p_key = f"p.{aspect_ratio.replace(':', 'x')}movie{motion}{camera_direction}"
embeddings = pipeline.embeddings
# Get embeddings
positive_embeds = embeddings.get(f"{p_key}.emb1", embeddings[f"{p_default_key}.emb1"])
positive_attention_mask = embeddings.get(f"{p_key}.mask1", embeddings[f"{p_default_key}.mask1"])
positive_embeds_2 = embeddings.get(f"{p_key}.emb2", embeddings[f"{p_default_key}.emb2"])
positive_attention_mask_2 = embeddings.get(f"{p_key}.mask2", embeddings[f"{p_default_key}.mask2"])
negative_embeds = embeddings[f"{n_default_key}.emb1"]
negative_attention_mask = embeddings[f"{n_default_key}.mask1"]
negative_embeds_2 = embeddings[f"{n_default_key}.emb2"]
negative_attention_mask_2 = embeddings[f"{n_default_key}.mask2"]
return {
"positive_embeds": positive_embeds,
"positive_attention_mask": positive_attention_mask,
"positive_embeds_2": positive_embeds_2,
"positive_attention_mask_2": positive_attention_mask_2,
"negative_embeds": negative_embeds,
"negative_attention_mask": negative_attention_mask,
"negative_embeds_2": negative_embeds_2,
"negative_attention_mask_2": negative_attention_mask_2,
}
def try_setup_pipeline(model_path, weight_dtype, config):
try:
# Get Vae
vae = AutoencoderKLMagvit.from_pretrained(
model_path,
subfolder="vae"
).to(weight_dtype)
print("Vae loaded ...")
# Get Transformer
transformer_additional_kwargs = OmegaConf.to_container(config['transformer_additional_kwargs'])
transformer = HunyuanTransformer3DModel.from_pretrained_2d(
model_path,
subfolder="transformer",
transformer_additional_kwargs=transformer_additional_kwargs
).to(weight_dtype)
print("Transformer loaded ...")
# Load Clip
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
model_path, subfolder="image_encoder"
).to(weight_dtype)
clip_image_processor = CLIPImageProcessor.from_pretrained(
model_path, subfolder="image_encoder"
)
# Load sampler and create pipeline
Chosen_Scheduler = DDIMScheduler
scheduler = Chosen_Scheduler.from_pretrained(
model_path,
subfolder="scheduler"
)
pipeline = RuyiInpaintPipeline.from_pretrained(
model_path,
vae=vae,
transformer=transformer,
scheduler=scheduler,
torch_dtype=weight_dtype,
clip_image_encoder=clip_image_encoder,
clip_image_processor=clip_image_processor,
)
# Load embeddings
embeddings = load_safetensors(os.path.join(model_path, "embeddings.safetensors"))
pipeline.embeddings = embeddings
print("Pipeline loaded ...")
return pipeline
except Exception as e:
print("[Ruyi] Setup pipeline failed:", e)
return None
# Load config
config = OmegaConf.load(config_path)
# Load images
start_img = [Image.open(start_image_path).convert("RGB")]
end_img = [Image.open(end_image_path).convert("RGB")] if end_image_path is not None else None
# Check for updates
repo_id = f"IamCreateAI/{model_name}"
if auto_download and auto_update:
print(f"Checking for {model_name} updates ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
# Init model
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None and auto_download:
print(f"Downloading {model_name} ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None:
message = (f"[Load Model Failed] "
f"Please download Ruyi model from huggingface repo '{repo_id}', "
f"And put it into '{model_path}'.")
if not auto_download:
message += "\nOr just set auto_download to 'True'."
raise FileNotFoundError(message)
# Setup GPU memory mode
if low_gpu_memory_mode:
pipeline.enable_sequential_cpu_offload()
else:
pipeline.enable_model_cpu_offload()
# Prepare LoRA config
loras = {
'models': [lora_path] if lora_path is not None else [],
'weights': [lora_weight] if lora_path is not None else [],
}
# Compute the most suitable height and width
if video_size is None:
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
else:
height, width = video_size
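# Example (video_size is None): with base_resolution = 512 and a 1280x720 (width x height) start image,
# the aspect ratio 720/1280 = 0.5625 falls into the '0.57' bucket of ASPECT_RATIO_512,
# giving height, width = 384, 672 (both already multiples of 16).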
# Set hidden states offload steps
pipeline.transformer.hidden_cache_size = gpu_offload_steps
# Load Sampler
if scheduler_name == "DPM++":
noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler":
noise_scheduler = EulerDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler A":
noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "PNDM":
noise_scheduler = PNDMScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "DDIM":
noise_scheduler = DDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
pipeline.scheduler = noise_scheduler
# Set random seed
generator = torch.Generator(device).manual_seed(seed)
# Load control embeddings
embeddings = get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction)
with torch.no_grad():
video_length = int(video_length // pipeline.vae.mini_batch_encoder * pipeline.vae.mini_batch_encoder) if video_length != 1 else 1
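# Note: video_length is rounded down to a multiple of pipeline.vae.mini_batch_encoder;
# for example, if mini_batch_encoder were 4, a requested length of 118 would become 116,
# while the default 120 would be kept as-is.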
input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
sample = pipeline(
prompt_embeds = embeddings["positive_embeds"],
prompt_attention_mask = embeddings["positive_attention_mask"],
prompt_embeds_2 = embeddings["positive_embeds_2"],
prompt_attention_mask_2 = embeddings["positive_attention_mask_2"],
negative_prompt_embeds = embeddings["negative_embeds"],
negative_prompt_attention_mask = embeddings["negative_attention_mask"],
negative_prompt_embeds_2 = embeddings["negative_embeds_2"],
negative_prompt_attention_mask_2 = embeddings["negative_attention_mask_2"],
video_length = video_length,
height = height,
width = width,
generator = generator,
guidance_scale = cfg,
num_inference_steps = steps,
video = input_video,
mask_video = input_video_mask,
clip_image = clip_image,
).videos
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
# Save the video
output_folder = os.path.dirname(output_video_path)
if output_folder != '':
os.makedirs(output_folder, exist_ok=True)
save_videos_grid(sample, output_video_path, fps=24)
import os
import torch
from PIL import Image
from diffusers import (EulerDiscreteScheduler, EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler, PNDMScheduler, DDIMScheduler)
from omegaconf import OmegaConf
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from safetensors.torch import load_file as load_safetensors
from huggingface_hub import snapshot_download
from ruyi.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
from ruyi.models.autoencoder_magvit import AutoencoderKLMagvit
from ruyi.models.transformer3d import HunyuanTransformer3DModel
from ruyi.pipeline.pipeline_ruyi_inpaint import RuyiInpaintPipeline
from ruyi.utils.lora_utils import merge_lora, unmerge_lora
from ruyi.utils.utils import get_image_to_video_latent, save_videos_grid
# Input and output
start_image_path = "assets/girl_01.jpg"
end_image_path = "assets/girl_02.jpg" # Can be None for start-image-to-video
output_video_path = "outputs/example_01.mp4"
# Video settings
video_length = 120 # The max video length is 120 frames (24 frames per second)
base_resolution = 512 # The generated video has approximately 512 x 512 pixels in total; values in the range [384, 896] typically produce good video quality.
video_size = None # Override base_resolution. Format: [height, width], e.g., [384, 672]
# Control settings
aspect_ratio = "9:16" # Choose in ["16:9", "9:16"], note that this is only the hint
motion = "auto" # Motion control, choose in ["1", "2", "3", "4", "auto"]
camera_direction = "auto" # Camera control, choose in ["static", "left", "right", "up", "down", "auto"]
# Sampler settings
steps = 25
cfg = 7.0
scheduler_name = "DDIM" # Choose in ["Euler", "Euler A", "DPM++", "PNDM","DDIM"]
# GPU memory settings
low_gpu_memory_mode = False # If True, use sequential CPU offload to reduce GPU memory usage
gpu_offload_steps = 0 # Choose from [0, 10, 7, 5, 1]; values later in the list use less GPU memory but take longer
# Random seed
seed = 42 # The Answer to the Ultimate Question of Life, The Universe, and Everything
# Model settings
config_path = "config/default.yaml"
model_name = "Ruyi-Mini-7B"
model_type = "Inpaint"
model_path = f"models/{model_name}" # (Down)load mode in this path
auto_download = True # Automatically download the model if the pipeline creation fails
auto_update = True # If auto_download is enabled, check for updates and update the model if necessary
# LoRA settings
lora_path = None
lora_weight = 1.0
# Other settings
weight_dtype = torch.bfloat16
device = torch.device("cuda")
def get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction):
# Default keys
p_default_key = "p.default"
n_default_key = "n.default"
# Load embeddings
if motion == "auto":
motion = "0"
p_key = f"p.{aspect_ratio.replace(':', 'x')}movie{motion}{camera_direction}"
embeddings = pipeline.embeddings
# Get embeddings
positive_embeds = embeddings.get(f"{p_key}.emb1", embeddings[f"{p_default_key}.emb1"])
positive_attention_mask = embeddings.get(f"{p_key}.mask1", embeddings[f"{p_default_key}.mask1"])
positive_embeds_2 = embeddings.get(f"{p_key}.emb2", embeddings[f"{p_default_key}.emb2"])
positive_attention_mask_2 = embeddings.get(f"{p_key}.mask2", embeddings[f"{p_default_key}.mask2"])
negative_embeds = embeddings[f"{n_default_key}.emb1"]
negative_attention_mask = embeddings[f"{n_default_key}.mask1"]
negative_embeds_2 = embeddings[f"{n_default_key}.emb2"]
negative_attention_mask_2 = embeddings[f"{n_default_key}.mask2"]
return {
"positive_embeds": positive_embeds,
"positive_attention_mask": positive_attention_mask,
"positive_embeds_2": positive_embeds_2,
"positive_attention_mask_2": positive_attention_mask_2,
"negative_embeds": negative_embeds,
"negative_attention_mask": negative_attention_mask,
"negative_embeds_2": negative_embeds_2,
"negative_attention_mask_2": negative_attention_mask_2,
}
def try_setup_pipeline(model_path, weight_dtype, config):
try:
# Get Vae
vae = AutoencoderKLMagvit.from_pretrained(
model_path,
subfolder="vae"
).to(weight_dtype)
print("Vae loaded ...")
# Get Transformer
transformer_additional_kwargs = OmegaConf.to_container(config['transformer_additional_kwargs'])
transformer = HunyuanTransformer3DModel.from_pretrained_2d(
model_path,
subfolder="transformer",
transformer_additional_kwargs=transformer_additional_kwargs
).to(weight_dtype)
print("Transformer loaded ...")
# Load Clip
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
model_path, subfolder="image_encoder"
).to(weight_dtype)
clip_image_processor = CLIPImageProcessor.from_pretrained(
model_path, subfolder="image_encoder"
)
# Load sampler and create pipeline
Chosen_Scheduler = DDIMScheduler
scheduler = Chosen_Scheduler.from_pretrained(
model_path,
subfolder="scheduler"
)
pipeline = RuyiInpaintPipeline.from_pretrained(
model_path,
vae=vae,
transformer=transformer,
scheduler=scheduler,
torch_dtype=weight_dtype,
clip_image_encoder=clip_image_encoder,
clip_image_processor=clip_image_processor,
)
# Load embeddings
embeddings = load_safetensors(os.path.join(model_path, "embeddings.safetensors"))
pipeline.embeddings = embeddings
print("Pipeline loaded ...")
return pipeline
except Exception as e:
print("[Ruyi] Setup pipeline failed:", e)
return None
# Load config
config = OmegaConf.load(config_path)
# Load images
start_img = [Image.open(start_image_path).convert("RGB")]
end_img = [Image.open(end_image_path).convert("RGB")] if end_image_path is not None else None
# Check for updates
repo_id = f"IamCreateAI/{model_name}"
if auto_download and auto_update:
print(f"Checking for {model_name} updates ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
# Init model
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None and auto_download:
print(f"Downloading {model_name} ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None:
message = (f"[Load Model Failed] "
f"Please download Ruyi model from huggingface repo '{repo_id}', "
f"And put it into '{model_path}'.")
if not auto_download:
message += "\nOr just set auto_download to 'True'."
raise FileNotFoundError(message)
# Setup GPU memory mode
if low_gpu_memory_mode:
pipeline.enable_sequential_cpu_offload()
else:
pipeline.enable_model_cpu_offload()
# Prepare LoRA config
loras = {
'models': [lora_path] if lora_path is not None else [],
'weights': [lora_weight] if lora_path is not None else [],
}
# Compute the most suitable height and width
if video_size is None:
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
else:
height, width = video_size
# Set hidden states offload steps
pipeline.transformer.hidden_cache_size = gpu_offload_steps
# Load Sampler
if scheduler_name == "DPM++":
noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler":
noise_scheduler = EulerDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler A":
noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "PNDM":
noise_scheduler = PNDMScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "DDIM":
noise_scheduler = DDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
pipeline.scheduler = noise_scheduler
# Set random seed
generator = torch.Generator(device).manual_seed(seed)
# Load control embeddings
embeddings = get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction)
with torch.no_grad():
video_length = int(video_length // pipeline.vae.mini_batch_encoder * pipeline.vae.mini_batch_encoder) if video_length != 1 else 1
input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
sample = pipeline(
prompt_embeds = embeddings["positive_embeds"],
prompt_attention_mask = embeddings["positive_attention_mask"],
prompt_embeds_2 = embeddings["positive_embeds_2"],
prompt_attention_mask_2 = embeddings["positive_attention_mask_2"],
negative_prompt_embeds = embeddings["negative_embeds"],
negative_prompt_attention_mask = embeddings["negative_attention_mask"],
negative_prompt_embeds_2 = embeddings["negative_embeds_2"],
negative_prompt_attention_mask_2 = embeddings["negative_attention_mask_2"],
video_length = video_length,
height = height,
width = width,
generator = generator,
guidance_scale = cfg,
num_inference_steps = steps,
video = input_video,
mask_video = input_video_mask,
clip_image = clip_image,
).videos
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
# Save the video
output_folder = os.path.dirname(output_video_path)
if output_folder != '':
os.makedirs(output_folder, exist_ok=True)
save_videos_grid(sample, output_video_path, fps=24)
[project]
name = "ruyi-models"
description = "ComfyUI wrapper nodes for Ruyi, an image-to-video model by CreateAI."
version = "1.0.1"
license = {file = "LICENSE"}
dependencies = ["Pillow", "einops", "safetensors", "timm", "tomesd", "torch", "torchdiffeq", "torchsde", "decord", "datasets", "numpy", "scikit-image", "opencv-python", "omegaconf", "SentencePiece", "albumentations", "imageio[ffmpeg]", "imageio[pyav]", "tensorboard", "beautifulsoup4", "ftfy", "func_timeout", "huggingface_hub", "accelerate>=0.26.0", "diffusers>=0.28.2", "transformers>=4.37.2"]
[project.urls]
Repository = "https://github.com/IamCreateAI/Ruyi-Models"
# Used by Comfy Registry https://comfyregistry.org
[tool.comfy]
PublisherId = "CreateAI"
DisplayName = "Ruyi-Models"
Icon = ""
Pillow
einops
safetensors
timm
tomesd
torch
torchdiffeq
torchsde
decord
datasets
numpy
scikit-image
opencv-python
omegaconf
SentencePiece
albumentations
imageio[ffmpeg]
imageio[pyav]
tensorboard
beautifulsoup4
ftfy
func_timeout
huggingface_hub
accelerate>=0.26.0
diffusers>=0.28.2
transformers>=4.37.2
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import (Generic, Iterable, Iterator, List, Optional, Sequence,
Sized, TypeVar, Union)
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data import BatchSampler, Dataset, Sampler
ASPECT_RATIO_512 = {
'0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
'0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
'0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
'0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
'0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
'1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
'1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
'1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
'2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
'3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
}
ASPECT_RATIO_RANDOM_CROP_512 = {
'0.42': [320.0, 768.0], '0.5': [352.0, 704.0],
'0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0],
'0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0],
'1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0],
'2.0': [704.0, 352.0], '2.4': [768.0, 320.0]
}
ASPECT_RATIO_RANDOM_CROP_PROB = [
1, 2,
4, 4, 4, 4,
8, 8, 8,
4, 4, 4, 4,
2, 1
]
ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB)
def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512):
aspect_ratio = height / width
closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
return ratios[closest_ratio], float(closest_ratio)
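# Example (illustrative values): an image 1280 pixels tall and 720 pixels wide has
# ratio 1280/720 ≈ 1.78, so get_closest_ratio(1280, 720) returns ([672.0, 384.0], 1.75),
# i.e. the 672x384 (height x width) bucket.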
def get_image_size_without_loading(path):
with Image.open(path) as img:
return img.size # (width, height)
class RandomSampler(Sampler[int]):
r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
If with replacement, then user can specify :attr:`num_samples` to draw.
Args:
data_source (Dataset): dataset to sample from
replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
num_samples (int): number of samples to draw, default=`len(dataset)`.
generator (Generator): Generator used in sampling.
"""
data_source: Sized
replacement: bool
def __init__(self, data_source: Sized, replacement: bool = False,
num_samples: Optional[int] = None, generator=None) -> None:
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.generator = generator
self._pos_start = 0
if not isinstance(self.replacement, bool):
raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}")
@property
def num_samples(self) -> int:
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self) -> Iterator[int]:
n = len(self.data_source)
if self.generator is None:
seed = int(torch.empty((), dtype=torch.int64).random_().item())
generator = torch.Generator()
generator.manual_seed(seed)
else:
generator = self.generator
if self.replacement:
for _ in range(self.num_samples // 32):
yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist()
yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist()
else:
for _ in range(self.num_samples // n):
xx = torch.randperm(n, generator=generator).tolist()
if self._pos_start >= n:
self._pos_start = 0
print("xx top 10", xx[:10], self._pos_start)
for idx in range(self._pos_start, n):
yield xx[idx]
self._pos_start = (self._pos_start + 1) % n
self._pos_start = 0
yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
def __len__(self) -> int:
return self.num_samples
class AspectRatioBatchImageSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(
self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
train_folder: str = None,
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False,
config=None,
**kwargs
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.train_folder = train_folder
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
self.config = config
# buckets for each aspect ratio
self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
# [str(k) for k, v in aspect_ratios]
self.current_available_bucket_keys = list(aspect_ratios.keys())
def __iter__(self):
for idx in self.sampler:
try:
image_dict = self.dataset[idx]
width, height = image_dict.get("weight", None), image_dict.get("height", None)
if width is None or height is None:
image_id, name = image_dict['file_path'], image_dict['text']
if self.train_folder is None:
image_dir = image_id
else:
image_dir = os.path.join(self.train_folder, image_id)
width, height = get_image_size_without_loading(image_dir)
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self._aspect_ratio_buckets[closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
class AspectRatioBatchSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(
self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
video_folder: str = None,
train_data_format: str = "webvid",
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False,
config=None,
**kwargs
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.video_folder = video_folder
self.train_data_format = train_data_format
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
self.config = config
# buckets for each aspect ratio
self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
# [str(k) for k, v in aspect_ratios]
self.current_available_bucket_keys = list(aspect_ratios.keys())
def __iter__(self):
for idx in self.sampler:
try:
video_dict = self.dataset[idx]
width, height = video_dict.get("width", None), video_dict.get("height", None)
if width is None or height is None:
if self.train_data_format == "normal":
video_id, name = video_dict['file_path'], video_dict['text']
if self.video_folder is None:
video_dir = video_id
else:
video_dir = os.path.join(self.video_folder, video_id)
else:
videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
cap = cv2.VideoCapture(video_dir)
# Get the video dimensions
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # convert float to int
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # convert float to int
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self._aspect_ratio_buckets[closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
class AspectRatioBatchImageVideoSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
train_folder: str = None,
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.train_folder = train_folder
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
# buckets for each aspect ratio
self.current_available_bucket_keys = list(aspect_ratios.keys())
self.bucket = {
'image':{ratio: [] for ratio in aspect_ratios},
'video':{ratio: [] for ratio in aspect_ratios}
}
def __iter__(self):
for idx in self.sampler:
content_type = self.dataset[idx].get('type', 'image')
if content_type == 'image':
try:
image_dict = self.dataset[idx]
width, height = image_dict.get("width", None), image_dict.get("height", None)
if width is None or height is None:
image_id, name = image_dict['file_path'], image_dict['text']
if self.train_folder is None:
image_dir = image_id
else:
image_dir = os.path.join(self.train_folder, image_id)
width, height = get_image_size_without_loading(image_dir)
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self.bucket['image'][closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
else:
try:
video_dict = self.dataset[idx]
width, height = video_dict.get("width", None), video_dict.get("height", None)
if width is None or height is None:
video_id, name = video_dict['file_path'], video_dict['text']
if self.train_folder is None:
video_dir = video_id
else:
video_dir = os.path.join(self.train_folder, video_id)
cap = cv2.VideoCapture(video_dir)
# Get the video dimensions
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # convert float to int
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # convert float to int
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self.bucket['video'][closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
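# A minimal usage sketch (names such as `my_dataset` are illustrative, not part of this module):
#
#     sampler = RandomSampler(my_dataset)
#     batch_sampler = AspectRatioBatchImageVideoSampler(
#         sampler=sampler, dataset=my_dataset, batch_size=4, train_folder="data/train"
#     )
#     for batch_indices in batch_sampler:
#         ...  # all indices in a batch share the same aspect-ratio bucket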
from .attention import *
from .transformer2d import *
from .transformer3d import *
from .autoencoder_magvit import *
from .embeddings import *
from .motion_module import *
from .norm import *
from .patch import *
from .resampler import *
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from diffusers.utils import deprecate
from diffusers.models.activations import FP32SiLU, get_activation
from diffusers.models.attention_processor import Attention
def get_timestep_embedding(
timesteps: torch.Tensor,
embedding_dim: int,
flip_sin_to_cos: bool = False,
downscale_freq_shift: float = 1,
scale: float = 1,
max_period: int = 10000,
):
"""
This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
:param timesteps: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
embeddings. :return: an [N x dim] Tensor of positional embeddings.
"""
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(
start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
)
exponent = exponent / (half_dim - downscale_freq_shift)
emb = torch.exp(exponent)
emb = timesteps[:, None].float() * emb[None, :]
# scale embeddings
emb = scale * emb
# concat sine and cosine embeddings
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
# flip sine and cosine embeddings
if flip_sin_to_cos:
emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
# zero pad
if embedding_dim % 2 == 1:
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
return emb
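# Example: get_timestep_embedding(torch.tensor([0., 10.]), 256) returns a (2, 256) tensor;
# the first 128 columns are sines and the last 128 are cosines (flip_sin_to_cos=True swaps the halves).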
class Timesteps(nn.Module):
def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
self.downscale_freq_shift = downscale_freq_shift
def forward(self, timesteps):
t_emb = get_timestep_embedding(
timesteps,
self.num_channels,
flip_sin_to_cos=self.flip_sin_to_cos,
downscale_freq_shift=self.downscale_freq_shift,
)
return t_emb
class TimestepEmbedding(nn.Module):
def __init__(
self,
in_channels: int,
time_embed_dim: int,
act_fn: str = "silu",
out_dim: int = None,
post_act_fn: Optional[str] = None,
cond_proj_dim=None,
sample_proj_bias=True,
):
super().__init__()
self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
if cond_proj_dim is not None:
self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
else:
self.cond_proj = None
self.act = get_activation(act_fn)
if out_dim is not None:
time_embed_dim_out = out_dim
else:
time_embed_dim_out = time_embed_dim
self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
if post_act_fn is None:
self.post_act = None
else:
self.post_act = get_activation(post_act_fn)
def forward(self, sample, condition=None):
if condition is not None:
sample = sample + self.cond_proj(condition)
sample = self.linear_1(sample)
if self.act is not None:
sample = self.act(sample)
sample = self.linear_2(sample)
if self.post_act is not None:
sample = self.post_act(sample)
return sample
class PixArtAlphaTextProjection(nn.Module):
"""
Projects caption embeddings. Also handles dropout for classifier-free guidance.
Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
"""
def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
super().__init__()
if out_features is None:
out_features = hidden_size
self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
if act_fn == "gelu_tanh":
self.act_1 = nn.GELU(approximate="tanh")
elif act_fn == "silu_fp32":
self.act_1 = FP32SiLU()
else:
raise ValueError(f"Unknown activation function: {act_fn}")
self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
def forward(self, caption):
hidden_states = self.linear_1(caption)
hidden_states = self.act_1(hidden_states)
hidden_states = self.linear_2(hidden_states)
return hidden_states
import torch
import torch.nn as nn
import torch.nn.functional as F
class HunyuanDiTAttentionPool(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads
def forward(self, x):
x = torch.cat([x.mean(dim=1, keepdim=True), x], dim=1)
x = x + self.positional_embedding[None, :, :].to(x.dtype)
query = self.q_proj(x[:, :1])
key = self.k_proj(x)
value = self.v_proj(x)
batch_size, _, _ = query.size()
query = query.reshape(batch_size, -1, self.num_heads, query.size(-1) // self.num_heads).transpose(1, 2) # (1, H, N, E/H)
key = key.reshape(batch_size, -1, self.num_heads, key.size(-1) // self.num_heads).transpose(1, 2) # (L+1, H, N, E/H)
value = value.reshape(batch_size, -1, self.num_heads, value.size(-1) // self.num_heads).transpose(1, 2) # (L+1, H, N, E/H)
x = F.scaled_dot_product_attention(query=query, key=key, value=value, attn_mask=None, dropout_p=0.0, is_causal=False)
x = x.transpose(1, 2).reshape(batch_size, 1, -1)
x = x.to(query.dtype)
x = self.c_proj(x)
return x.squeeze(1)
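# In short: the pool prepends the mean token, adds a learned positional embedding, and lets that
# single mean-token query attend over all tokens, reducing a (N, L, embed_dim) input
# (with L == spacial_dim) to a (N, output_dim or embed_dim) summary vector.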
class HunyuanCombinedTimestepTextSizeStyleEmbedding(nn.Module):
def __init__(self, embedding_dim, pooled_projection_dim=1024, seq_len=256, cross_attention_dim=2048):
super().__init__()
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
self.pooler = HunyuanDiTAttentionPool(
seq_len, cross_attention_dim, num_heads=8, output_dim=pooled_projection_dim
)
# Here we use a default learned embedder layer for future extension.
self.style_embedder = nn.Embedding(1, embedding_dim)
extra_in_dim = 256 * 6 + embedding_dim + pooled_projection_dim
self.extra_embedder = PixArtAlphaTextProjection(
in_features=extra_in_dim,
hidden_size=embedding_dim * 4,
out_features=embedding_dim,
act_fn="silu_fp32",
)
def forward(self, timestep, encoder_hidden_states, image_meta_size, style, hidden_dtype=None):
timesteps_proj = self.time_proj(timestep)
timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, 256)
# extra condition1: text
pooled_projections = self.pooler(encoder_hidden_states) # (N, 1024)
# extra condition2: image meta size embedding
image_meta_size = get_timestep_embedding(image_meta_size.view(-1), 256, True, 0)
image_meta_size = image_meta_size.to(dtype=hidden_dtype)
image_meta_size = image_meta_size.view(-1, 6 * 256) # (N, 1536)
# extra condition3: style embedding
style_embedding = self.style_embedder(style) # (N, embedding_dim)
# Concatenate all extra vectors
extra_cond = torch.cat([pooled_projections, image_meta_size, style_embedding], dim=1)
conditioning = timesteps_emb + self.extra_embedder(extra_cond) # [B, D]
return conditioning
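# Note: extra_in_dim = 256 * 6 + embedding_dim + pooled_projection_dim matches the concatenation above:
# the pooled text projection (pooled_projection_dim), the image meta size embedding (6 values x 256 dims each)
# and the style embedding (embedding_dim).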
from typing import Any, Dict, Optional, Tuple
import torch
import torch.nn.functional as F
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from torch import nn
def zero_module(module):
# Zero out the parameters of a module and return it.
for p in module.parameters():
p.detach().zero_()
return module
class FP32LayerNorm(nn.LayerNorm):
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
origin_dtype = inputs.dtype
if hasattr(self, 'weight') and self.weight is not None:
return F.layer_norm(
inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps
).to(origin_dtype)
else:
return F.layer_norm(
inputs.float(), self.normalized_shape, None, None, self.eps
).to(origin_dtype)
class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
"""
For PixArt-Alpha.
Reference:
https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
"""
def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
super().__init__()
self.outdim = size_emb_dim
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
self.use_additional_conditions = use_additional_conditions
if use_additional_conditions:
self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
self.resolution_embedder.linear_2 = zero_module(self.resolution_embedder.linear_2)
self.aspect_ratio_embedder.linear_2 = zero_module(self.aspect_ratio_embedder.linear_2)
def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
timesteps_proj = self.time_proj(timestep)
timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
if self.use_additional_conditions:
resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
else:
conditioning = timesteps_emb
return conditioning
class AdaLayerNormSingle(nn.Module):
r"""
Norm layer adaptive layer norm single (adaLN-single).
As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
Parameters:
embedding_dim (`int`): The size of each embedding vector.
use_additional_conditions (`bool`): To use additional conditions for normalization or not.
"""
def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
super().__init__()
self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
)
self.silu = nn.SiLU()
self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
def forward(
self,
timestep: torch.Tensor,
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
batch_size: Optional[int] = None,
hidden_dtype: Optional[torch.dtype] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
# No modulation happening here.
embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
return self.linear(self.silu(embedded_timestep)), embedded_timestep
class AdaLayerNormShift(nn.Module):
r"""
Norm layer modified to incorporate timestep embeddings.
Parameters:
embedding_dim (`int`): The size of each embedding vector.
"""
def __init__(self, embedding_dim: int, elementwise_affine=True, eps=1e-6):
super().__init__()
self.silu = nn.SiLU()
self.linear = nn.Linear(embedding_dim, embedding_dim)
self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=elementwise_affine, eps=eps)
def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
shift = self.linear(self.silu(emb.to(torch.float32)).to(emb.dtype))
x = self.norm(x) + shift.unsqueeze(dim=1)
return x
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import normal_
def get_abs_pos(abs_pos, tgt_size):
# abs_pos: L, C
# tgt_size: M
# return: M, C
src_size = int(math.sqrt(abs_pos.size(0)))
tgt_size = int(math.sqrt(tgt_size))
dtype = abs_pos.dtype
if src_size != tgt_size:
return F.interpolate(
abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
size=(tgt_size, tgt_size),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
else:
return abs_pos
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float32)
omega /= embed_dim / 2.
omega = 1. / 10000**omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
class Resampler(nn.Module):
"""
A 2D perceiver-resampler network with one cross-attention layer, using
(grid_size**2) learnable queries and 2D sincos pos_emb.
Outputs:
A tensor with the shape of (grid_size**2, embed_dim)
"""
def __init__(
self,
grid_size,
embed_dim,
num_heads,
kv_dim=None,
norm_layer=nn.LayerNorm
):
super().__init__()
self.num_queries = grid_size ** 2
self.embed_dim = embed_dim
self.num_heads = num_heads
self.pos_embed = nn.Parameter(
torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
).requires_grad_(False)
self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
normal_(self.query, std=.02)
if kv_dim is not None and kv_dim != embed_dim:
self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
else:
self.kv_proj = nn.Identity()
self.attn = nn.MultiheadAttention(embed_dim, num_heads)
self.ln_q = norm_layer(embed_dim)
self.ln_kv = norm_layer(embed_dim)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward(self, x, key_padding_mask=None):
pos_embed = get_abs_pos(self.pos_embed, x.size(1))
x = self.kv_proj(x)
x = self.ln_kv(x).permute(1, 0, 2)
N = x.shape[1]
q = self.ln_q(self.query)
out = self.attn(
self._repeat(q, N) + self.pos_embed.unsqueeze(1),
x + pos_embed.unsqueeze(1),
x,
key_padding_mask=key_padding_mask)[0]
return out.permute(1, 0, 2)
def _repeat(self, query, N: int):
return query.unsqueeze(1).repeat(1, N, 1)
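# In short: Resampler maps a (batch, seq_len, kv_dim) feature sequence onto grid_size**2 learned queries
# via a single cross-attention, returning a (batch, grid_size**2, embed_dim) tensor.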