Commit 08a21d59 authored by chenpangpang

feat: initial commit

parent 1a6b26f1
Pipeline #2165 failed with stages in 0 seconds
{
"last_node_id": 6,
"last_link_id": 3,
"nodes": [
{
"id": 1,
"type": "Ruyi_LoadModel",
"pos": {
"0": 210,
"1": 162
},
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "ruyi_model",
"type": "RUYI_MODEL",
"links": [
1
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "Ruyi_LoadModel"
},
"widgets_values": [
"Ruyi-Mini-7B",
"yes",
"yes"
]
},
{
"id": 4,
"type": "VHS_VideoCombine",
"pos": {
"0": 1045,
"1": 133
},
"size": [
404.73553466796875,
601.8645528157551
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 3
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"shape": 7
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"shape": 7
},
{
"name": "vae",
"type": "VAE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine"
},
"widgets_values": {
"frame_rate": 24,
"loop_count": 0,
"filename_prefix": "Ruyi-I2V-StartFrame",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"pingpong": false,
"save_output": true,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "Ruyi-I2V-StartFrame_00001.mp4",
"subfolder": "",
"type": "output",
"format": "video/h264-mp4",
"frame_rate": 24
},
"muted": false
}
}
},
{
"id": 3,
"type": "LoadImage",
"pos": {
"0": 200,
"1": 439
},
"size": {
"0": 315,
"1": 314
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
2
],
"slot_index": 0
},
{
"name": "MASK",
"type": "MASK",
"links": null
}
],
"properties": {
"Node name for S&R": "LoadImage"
},
"widgets_values": [
"example_03.jpg",
"image"
]
},
{
"id": 2,
"type": "Ruyi_I2VSampler",
"pos": {
"0": 628,
"1": 284
},
"size": {
"0": 327.5999755859375,
"1": 338
},
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "ruyi_model",
"type": "RUYI_MODEL",
"link": 1
},
{
"name": "start_img",
"type": "IMAGE",
"link": 2
},
{
"name": "end_img",
"type": "IMAGE",
"link": null,
"shape": 7
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
3
],
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "Ruyi_I2VSampler"
},
"widgets_values": [
120,
512,
925247271358454,
"randomize",
25,
7,
"DDIM",
"2",
"static",
"normal_mode",
"5"
]
}
],
"links": [
[
1,
1,
0,
2,
0,
"RUYI_MODEL"
],
[
2,
3,
0,
2,
1,
"IMAGE"
],
[
3,
2,
0,
4,
0,
"IMAGE"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 1,
"offset": [
0,
0
]
}
},
"version": 0.4
}
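The workflow above wires Ruyi_LoadModel and LoadImage into Ruyi_I2VSampler, whose image output feeds VHS_VideoCombine. Below is a minimal sketch for inspecting such a workflow file outside ComfyUI; the file name workflow_ruyi_i2v.json is assumed for illustration, and each entry in "links" is [link_id, source_node, source_slot, target_node, target_slot, type].

import json

# Hypothetical path to the exported workflow shown above.
with open("workflow_ruyi_i2v.json") as f:
    workflow = json.load(f)

# Map node ids to node types for readable output.
node_types = {node["id"]: node["type"] for node in workflow["nodes"]}

# Print each connection: source node/slot -> target node/slot (link type).
for link_id, src, src_slot, dst, dst_slot, link_type in workflow["links"]:
    print(f"link {link_id}: {node_types[src]}[{src_slot}] -> {node_types[dst]}[{dst_slot}] ({link_type})")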
transformer_additional_kwargs:
basic_block_type: "basic"
after_norm: false
time_position_encoding: true
noise_scheduler_kwargs:
beta_start: 0.00085
beta_end: 0.03
beta_schedule: "scaled_linear"
steps_offset: 1
prediction_type: "v_prediction"
clip_sample: false
vae_kwargs:
enable_magvit: true
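# Note: in the example scripts below, this config is loaded with OmegaConf.load(), and
# transformer_additional_kwargs is converted via OmegaConf.to_container() before being
# passed to HunyuanTransformer3DModel.from_pretrained_2d().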
import os
import torch
from PIL import Image
from diffusers import (EulerDiscreteScheduler, EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler, PNDMScheduler, DDIMScheduler)
from omegaconf import OmegaConf
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from safetensors.torch import load_file as load_safetensors
from huggingface_hub import snapshot_download
from ruyi.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
from ruyi.models.autoencoder_magvit import AutoencoderKLMagvit
from ruyi.models.transformer3d import HunyuanTransformer3DModel
from ruyi.pipeline.pipeline_ruyi_inpaint import RuyiInpaintPipeline
from ruyi.utils.lora_utils import merge_lora, unmerge_lora
from ruyi.utils.utils import get_image_to_video_latent, save_videos_grid
# Input and output
start_image_path = "assets/girl_01.jpg"
end_image_path = "assets/girl_02.jpg" # Can be None for start-image-to-video
output_video_path = "outputs/example_01.mp4"
# Video settings
video_length = 120 # The max video length is 120 frames (24 frames per second)
base_resolution = 512 # The generated video has approximately 512 x 512 pixels in total; values in the range [384, 896] typically produce good video quality.
video_size = None # Override base_resolution. Format: [height, width], e.g., [384, 672]
# Control settings
aspect_ratio = "16:9" # Choose in ["16:9", "9:16"], note that this is only the hint
motion = "auto" # Motion control, choose in ["1", "2", "3", "4", "auto"]
camera_direction = "auto" # Camera control, choose in ["static", "left", "right", "up", "down", "auto"]
# Sampler settings
steps = 25
cfg = 7.0
scheduler_name = "DDIM" # Choose in ["Euler", "Euler A", "DPM++", "PNDM","DDIM"]
# GPU memory settings
low_gpu_memory_mode = False # If True, use sequential CPU offload to reduce GPU memory usage
gpu_offload_steps = 5 # Choose from [0, 10, 7, 5, 1]; values later in the list use less GPU memory but take longer
# Random seed
seed = 42 # The Answer to the Ultimate Question of Life, The Universe, and Everything
# Model settings
config_path = "config/default.yaml"
model_name = "Ruyi-Mini-7B"
model_type = "Inpaint"
model_path = f"models/{model_name}" # (Down)load mode in this path
auto_download = True # Automatically download the model if the pipeline creation fails
auto_update = True # If auto_download is enabled, check for updates and update the model if necessary
# LoRA settings
lora_path = None
lora_weight = 1.0
# Other settings
weight_dtype = torch.bfloat16
device = torch.device("cuda")
def get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction):
# Default keys
p_default_key = "p.default"
n_default_key = "n.default"
# Load embeddings
if motion == "auto":
motion = "0"
p_key = f"p.{aspect_ratio.replace(':', 'x')}movie{motion}{camera_direction}"
embeddings = pipeline.embeddings
# Get embeddings
positive_embeds = embeddings.get(f"{p_key}.emb1", embeddings[f"{p_default_key}.emb1"])
positive_attention_mask = embeddings.get(f"{p_key}.mask1", embeddings[f"{p_default_key}.mask1"])
positive_embeds_2 = embeddings.get(f"{p_key}.emb2", embeddings[f"{p_default_key}.emb2"])
positive_attention_mask_2 = embeddings.get(f"{p_key}.mask2", embeddings[f"{p_default_key}.mask2"])
negative_embeds = embeddings[f"{n_default_key}.emb1"]
negative_attention_mask = embeddings[f"{n_default_key}.mask1"]
negative_embeds_2 = embeddings[f"{n_default_key}.emb2"]
negative_attention_mask_2 = embeddings[f"{n_default_key}.mask2"]
return {
"positive_embeds": positive_embeds,
"positive_attention_mask": positive_attention_mask,
"positive_embeds_2": positive_embeds_2,
"positive_attention_mask_2": positive_attention_mask_2,
"negative_embeds": negative_embeds,
"negative_attention_mask": negative_attention_mask,
"negative_embeds_2": negative_embeds_2,
"negative_attention_mask_2": negative_attention_mask_2,
}
def try_setup_pipeline(model_path, weight_dtype, config):
try:
# Get Vae
vae = AutoencoderKLMagvit.from_pretrained(
model_path,
subfolder="vae"
).to(weight_dtype)
print("Vae loaded ...")
# Get Transformer
transformer_additional_kwargs = OmegaConf.to_container(config['transformer_additional_kwargs'])
transformer = HunyuanTransformer3DModel.from_pretrained_2d(
model_path,
subfolder="transformer",
transformer_additional_kwargs=transformer_additional_kwargs
).to(weight_dtype)
print("Transformer loaded ...")
# Load Clip
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
model_path, subfolder="image_encoder"
).to(weight_dtype)
clip_image_processor = CLIPImageProcessor.from_pretrained(
model_path, subfolder="image_encoder"
)
# Load sampler and create pipeline
Chosen_Scheduler = DDIMScheduler
scheduler = Chosen_Scheduler.from_pretrained(
model_path,
subfolder="scheduler"
)
pipeline = RuyiInpaintPipeline.from_pretrained(
model_path,
vae=vae,
transformer=transformer,
scheduler=scheduler,
torch_dtype=weight_dtype,
clip_image_encoder=clip_image_encoder,
clip_image_processor=clip_image_processor,
)
# Load embeddings
embeddings = load_safetensors(os.path.join(model_path, "embeddings.safetensors"))
pipeline.embeddings = embeddings
print("Pipeline loaded ...")
return pipeline
except Exception as e:
print("[Ruyi] Setup pipeline failed:", e)
return None
# Load config
config = OmegaConf.load(config_path)
# Load images
start_img = [Image.open(start_image_path).convert("RGB")]
end_img = [Image.open(end_image_path).convert("RGB")] if end_image_path is not None else None
# Check for updates
repo_id = f"IamCreateAI/{model_name}"
if auto_download and auto_update:
print(f"Checking for {model_name} updates ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
# Init model
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None and auto_download:
print(f"Downloading {model_name} ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None:
message = (f"[Load Model Failed] "
f"Please download Ruyi model from huggingface repo '{repo_id}', "
f"And put it into '{model_path}'.")
if not auto_download:
message += "\nOr just set auto_download to 'True'."
raise FileNotFoundError(message)
# Setup GPU memory mode
if low_gpu_memory_mode:
pipeline.enable_sequential_cpu_offload()
else:
pipeline.enable_model_cpu_offload()
# Prepare LoRA config
loras = {
'models': [lora_path] if lora_path is not None else [],
'weights': [lora_weight] if lora_path is not None else [],
}
# Compute the most suitable height and width
if video_size is None:
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
else:
height, width = video_size
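# Example (video_size is None): with base_resolution = 512 and a 1280x720 (width x height) start image,
# the aspect ratio 720/1280 = 0.5625 falls into the '0.57' bucket of ASPECT_RATIO_512,
# giving height, width = 384, 672 (both already multiples of 16).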
# Set hidden states offload steps
pipeline.transformer.hidden_cache_size = gpu_offload_steps
# Load Sampler
if scheduler_name == "DPM++":
noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler":
noise_scheduler = EulerDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler A":
noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "PNDM":
noise_scheduler = PNDMScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "DDIM":
noise_scheduler = DDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
pipeline.scheduler = noise_scheduler
# Set random seed
generator = torch.Generator(device).manual_seed(seed)
# Load control embeddings
embeddings = get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction)
with torch.no_grad():
video_length = int(video_length // pipeline.vae.mini_batch_encoder * pipeline.vae.mini_batch_encoder) if video_length != 1 else 1
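# Note: video_length is rounded down to a multiple of pipeline.vae.mini_batch_encoder;
# for example, if mini_batch_encoder were 4, a requested length of 118 would become 116,
# while the default 120 would be kept as-is.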
input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
sample = pipeline(
prompt_embeds = embeddings["positive_embeds"],
prompt_attention_mask = embeddings["positive_attention_mask"],
prompt_embeds_2 = embeddings["positive_embeds_2"],
prompt_attention_mask_2 = embeddings["positive_attention_mask_2"],
negative_prompt_embeds = embeddings["negative_embeds"],
negative_prompt_attention_mask = embeddings["negative_attention_mask"],
negative_prompt_embeds_2 = embeddings["negative_embeds_2"],
negative_prompt_attention_mask_2 = embeddings["negative_attention_mask_2"],
video_length = video_length,
height = height,
width = width,
generator = generator,
guidance_scale = cfg,
num_inference_steps = steps,
video = input_video,
mask_video = input_video_mask,
clip_image = clip_image,
).videos
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
# Save the video
output_folder = os.path.dirname(output_video_path)
if output_folder != '':
os.makedirs(output_folder, exist_ok=True)
save_videos_grid(sample, output_video_path, fps=24)
import os
import torch
from PIL import Image
from diffusers import (EulerDiscreteScheduler, EulerAncestralDiscreteScheduler,
DPMSolverMultistepScheduler, PNDMScheduler, DDIMScheduler)
from omegaconf import OmegaConf
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from safetensors.torch import load_file as load_safetensors
from huggingface_hub import snapshot_download
from ruyi.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
from ruyi.models.autoencoder_magvit import AutoencoderKLMagvit
from ruyi.models.transformer3d import HunyuanTransformer3DModel
from ruyi.pipeline.pipeline_ruyi_inpaint import RuyiInpaintPipeline
from ruyi.utils.lora_utils import merge_lora, unmerge_lora
from ruyi.utils.utils import get_image_to_video_latent, save_videos_grid
# Input and output
start_image_path = "assets/girl_01.jpg"
end_image_path = "assets/girl_02.jpg" # Can be None for start-image-to-video
output_video_path = "outputs/example_01.mp4"
# Video settings
video_length = 120 # The max video length is 120 frames (24 frames per second)
base_resolution = 512 # The generated video has approximately 512 x 512 pixels in total; values in the range [384, 896] typically produce good video quality.
video_size = None # Override base_resolution. Format: [height, width], e.g., [384, 672]
# Control settings
aspect_ratio = "9:16" # Choose in ["16:9", "9:16"], note that this is only the hint
motion = "auto" # Motion control, choose in ["1", "2", "3", "4", "auto"]
camera_direction = "auto" # Camera control, choose in ["static", "left", "right", "up", "down", "auto"]
# Sampler settings
steps = 25
cfg = 7.0
scheduler_name = "DDIM" # Choose in ["Euler", "Euler A", "DPM++", "PNDM","DDIM"]
# GPU memory settings
low_gpu_memory_mode = False # If True, use sequential CPU offload to reduce GPU memory usage
gpu_offload_steps = 0 # Choose from [0, 10, 7, 5, 1]; values later in the list use less GPU memory but take longer
# Random seed
seed = 42 # The Answer to the Ultimate Question of Life, The Universe, and Everything
# Model settings
config_path = "config/default.yaml"
model_name = "Ruyi-Mini-7B"
model_type = "Inpaint"
model_path = f"models/{model_name}" # (Down)load mode in this path
auto_download = True # Automatically download the model if the pipeline creation fails
auto_update = True # If auto_download is enabled, check for updates and update the model if necessary
# LoRA settings
lora_path = None
lora_weight = 1.0
# Other settings
weight_dtype = torch.bfloat16
device = torch.device("cuda")
def get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction):
# Default keys
p_default_key = "p.default"
n_default_key = "n.default"
# Load embeddings
if motion == "auto":
motion = "0"
p_key = f"p.{aspect_ratio.replace(':', 'x')}movie{motion}{camera_direction}"
embeddings = pipeline.embeddings
# Get embeddings
positive_embeds = embeddings.get(f"{p_key}.emb1", embeddings[f"{p_default_key}.emb1"])
positive_attention_mask = embeddings.get(f"{p_key}.mask1", embeddings[f"{p_default_key}.mask1"])
positive_embeds_2 = embeddings.get(f"{p_key}.emb2", embeddings[f"{p_default_key}.emb2"])
positive_attention_mask_2 = embeddings.get(f"{p_key}.mask2", embeddings[f"{p_default_key}.mask2"])
negative_embeds = embeddings[f"{n_default_key}.emb1"]
negative_attention_mask = embeddings[f"{n_default_key}.mask1"]
negative_embeds_2 = embeddings[f"{n_default_key}.emb2"]
negative_attention_mask_2 = embeddings[f"{n_default_key}.mask2"]
return {
"positive_embeds": positive_embeds,
"positive_attention_mask": positive_attention_mask,
"positive_embeds_2": positive_embeds_2,
"positive_attention_mask_2": positive_attention_mask_2,
"negative_embeds": negative_embeds,
"negative_attention_mask": negative_attention_mask,
"negative_embeds_2": negative_embeds_2,
"negative_attention_mask_2": negative_attention_mask_2,
}
def try_setup_pipeline(model_path, weight_dtype, config):
try:
# Get Vae
vae = AutoencoderKLMagvit.from_pretrained(
model_path,
subfolder="vae"
).to(weight_dtype)
print("Vae loaded ...")
# Get Transformer
transformer_additional_kwargs = OmegaConf.to_container(config['transformer_additional_kwargs'])
transformer = HunyuanTransformer3DModel.from_pretrained_2d(
model_path,
subfolder="transformer",
transformer_additional_kwargs=transformer_additional_kwargs
).to(weight_dtype)
print("Transformer loaded ...")
# Load Clip
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
model_path, subfolder="image_encoder"
).to(weight_dtype)
clip_image_processor = CLIPImageProcessor.from_pretrained(
model_path, subfolder="image_encoder"
)
# Load sampler and create pipeline
Chosen_Scheduler = DDIMScheduler
scheduler = Chosen_Scheduler.from_pretrained(
model_path,
subfolder="scheduler"
)
pipeline = RuyiInpaintPipeline.from_pretrained(
model_path,
vae=vae,
transformer=transformer,
scheduler=scheduler,
torch_dtype=weight_dtype,
clip_image_encoder=clip_image_encoder,
clip_image_processor=clip_image_processor,
)
# Load embeddings
embeddings = load_safetensors(os.path.join(model_path, "embeddings.safetensors"))
pipeline.embeddings = embeddings
print("Pipeline loaded ...")
return pipeline
except Exception as e:
print("[Ruyi] Setup pipeline failed:", e)
return None
# Load config
config = OmegaConf.load(config_path)
# Load images
start_img = [Image.open(start_image_path).convert("RGB")]
end_img = [Image.open(end_image_path).convert("RGB")] if end_image_path is not None else None
# Check for updates
repo_id = f"IamCreateAI/{model_name}"
if auto_download and auto_update:
print(f"Checking for {model_name} updates ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
# Init model
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None and auto_download:
print(f"Downloading {model_name} ...")
# Download the model
snapshot_download(repo_id=repo_id, local_dir=model_path)
pipeline = try_setup_pipeline(model_path, weight_dtype, config)
if pipeline is None:
message = (f"[Load Model Failed] "
f"Please download Ruyi model from huggingface repo '{repo_id}', "
f"And put it into '{model_path}'.")
if not auto_download:
message += "\nOr just set auto_download to 'True'."
raise FileNotFoundError(message)
# Setup GPU memory mode
if low_gpu_memory_mode:
pipeline.enable_sequential_cpu_offload()
else:
pipeline.enable_model_cpu_offload()
# Prepare LoRA config
loras = {
'models': [lora_path] if lora_path is not None else [],
'weights': [lora_weight] if lora_path is not None else [],
}
# Compute the most suitable height and width
if video_size is None:
aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
height, width = [int(x / 16) * 16 for x in closest_size]
else:
height, width = video_size
# Set hidden states offload steps
pipeline.transformer.hidden_cache_size = gpu_offload_steps
# Load Sampler
if scheduler_name == "DPM++":
noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler":
noise_scheduler = EulerDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "Euler A":
noise_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "PNDM":
noise_scheduler = PNDMScheduler.from_pretrained(model_path, subfolder='scheduler')
elif scheduler_name == "DDIM":
noise_scheduler = DDIMScheduler.from_pretrained(model_path, subfolder='scheduler')
pipeline.scheduler = noise_scheduler
# Set random seed
generator = torch.Generator(device).manual_seed(seed)
# Load control embeddings
embeddings = get_control_embeddings(pipeline, aspect_ratio, motion, camera_direction)
with torch.no_grad():
video_length = int(video_length // pipeline.vae.mini_batch_encoder * pipeline.vae.mini_batch_encoder) if video_length != 1 else 1
input_video, input_video_mask, clip_image = get_image_to_video_latent(start_img, end_img, video_length=video_length, sample_size=(height, width))
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
sample = pipeline(
prompt_embeds = embeddings["positive_embeds"],
prompt_attention_mask = embeddings["positive_attention_mask"],
prompt_embeds_2 = embeddings["positive_embeds_2"],
prompt_attention_mask_2 = embeddings["positive_attention_mask_2"],
negative_prompt_embeds = embeddings["negative_embeds"],
negative_prompt_attention_mask = embeddings["negative_attention_mask"],
negative_prompt_embeds_2 = embeddings["negative_embeds_2"],
negative_prompt_attention_mask_2 = embeddings["negative_attention_mask_2"],
video_length = video_length,
height = height,
width = width,
generator = generator,
guidance_scale = cfg,
num_inference_steps = steps,
video = input_video,
mask_video = input_video_mask,
clip_image = clip_image,
).videos
for _lora_path, _lora_weight in zip(loras.get("models", []), loras.get("weights", [])):
pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
# Save the video
output_folder = os.path.dirname(output_video_path)
if output_folder != '':
os.makedirs(output_folder, exist_ok=True)
save_videos_grid(sample, output_video_path, fps=24)
[project]
name = "ruyi-models"
description = "ComfyUI wrapper nodes for Ruyi, an image-to-video model by CreateAI."
version = "1.0.1"
license = {file = "LICENSE"}
dependencies = ["Pillow", "einops", "safetensors", "timm", "tomesd", "torch", "torchdiffeq", "torchsde", "decord", "datasets", "numpy", "scikit-image", "opencv-python", "omegaconf", "SentencePiece", "albumentations", "imageio[ffmpeg]", "imageio[pyav]", "tensorboard", "beautifulsoup4", "ftfy", "func_timeout", "huggingface_hub", "accelerate>=0.26.0", "diffusers>=0.28.2", "transformers>=4.37.2"]
[project.urls]
Repository = "https://github.com/IamCreateAI/Ruyi-Models"
# Used by Comfy Registry https://comfyregistry.org
[tool.comfy]
PublisherId = "CreateAI"
DisplayName = "Ruyi-Models"
Icon = ""
Pillow
einops
safetensors
timm
tomesd
torch
torchdiffeq
torchsde
decord
datasets
numpy
scikit-image
opencv-python
omegaconf
SentencePiece
albumentations
imageio[ffmpeg]
imageio[pyav]
tensorboard
beautifulsoup4
ftfy
func_timeout
huggingface_hub
accelerate>=0.26.0
diffusers>=0.28.2
transformers>=4.37.2
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import (Generic, Iterable, Iterator, List, Optional, Sequence,
Sized, TypeVar, Union)
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data import BatchSampler, Dataset, Sampler
ASPECT_RATIO_512 = {
'0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
'0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
'0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
'0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
'0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
'1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
'1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
'1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
'2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
'3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
}
ASPECT_RATIO_RANDOM_CROP_512 = {
'0.42': [320.0, 768.0], '0.5': [352.0, 704.0],
'0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0],
'0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0],
'1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0],
'2.0': [704.0, 352.0], '2.4': [768.0, 320.0]
}
ASPECT_RATIO_RANDOM_CROP_PROB = [
1, 2,
4, 4, 4, 4,
8, 8, 8,
4, 4, 4, 4,
2, 1
]
ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB)
def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512):
aspect_ratio = height / width
closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
return ratios[closest_ratio], float(closest_ratio)
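# Example (illustrative values): an image 1280 pixels tall and 720 pixels wide has
# ratio 1280/720 ≈ 1.78, so get_closest_ratio(1280, 720) returns ([672.0, 384.0], 1.75),
# i.e. the 672x384 (height x width) bucket.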
def get_image_size_without_loading(path):
with Image.open(path) as img:
return img.size # (width, height)
class RandomSampler(Sampler[int]):
r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
If with replacement, then user can specify :attr:`num_samples` to draw.
Args:
data_source (Dataset): dataset to sample from
replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
num_samples (int): number of samples to draw, default=`len(dataset)`.
generator (Generator): Generator used in sampling.
"""
data_source: Sized
replacement: bool
def __init__(self, data_source: Sized, replacement: bool = False,
num_samples: Optional[int] = None, generator=None) -> None:
self.data_source = data_source
self.replacement = replacement
self._num_samples = num_samples
self.generator = generator
self._pos_start = 0
if not isinstance(self.replacement, bool):
raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}")
if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}")
@property
def num_samples(self) -> int:
# dataset size might change at runtime
if self._num_samples is None:
return len(self.data_source)
return self._num_samples
def __iter__(self) -> Iterator[int]:
n = len(self.data_source)
if self.generator is None:
seed = int(torch.empty((), dtype=torch.int64).random_().item())
generator = torch.Generator()
generator.manual_seed(seed)
else:
generator = self.generator
if self.replacement:
for _ in range(self.num_samples // 32):
yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist()
yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist()
else:
for _ in range(self.num_samples // n):
xx = torch.randperm(n, generator=generator).tolist()
if self._pos_start >= n:
self._pos_start = 0
print("xx top 10", xx[:10], self._pos_start)
for idx in range(self._pos_start, n):
yield xx[idx]
self._pos_start = (self._pos_start + 1) % n
self._pos_start = 0
yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
def __len__(self) -> int:
return self.num_samples
class AspectRatioBatchImageSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(
self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
train_folder: str = None,
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False,
config=None,
**kwargs
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.train_folder = train_folder
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
self.config = config
# buckets for each aspect ratio
self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
# [str(k) for k, v in aspect_ratios]
self.current_available_bucket_keys = list(aspect_ratios.keys())
def __iter__(self):
for idx in self.sampler:
try:
image_dict = self.dataset[idx]
width, height = image_dict.get("weight", None), image_dict.get("height", None)
if width is None or height is None:
image_id, name = image_dict['file_path'], image_dict['text']
if self.train_folder is None:
image_dir = image_id
else:
image_dir = os.path.join(self.train_folder, image_id)
width, height = get_image_size_without_loading(image_dir)
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self._aspect_ratio_buckets[closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
class AspectRatioBatchSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(
self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
video_folder: str = None,
train_data_format: str = "webvid",
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False,
config=None,
**kwargs
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.video_folder = video_folder
self.train_data_format = train_data_format
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
self.config = config
# buckets for each aspect ratio
self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
# [str(k) for k, v in aspect_ratios]
self.current_available_bucket_keys = list(aspect_ratios.keys())
def __iter__(self):
for idx in self.sampler:
try:
video_dict = self.dataset[idx]
width, height = video_dict.get("width", None), video_dict.get("height", None)
if width is None or height is None:
if self.train_data_format == "normal":
video_id, name = video_dict['file_path'], video_dict['text']
if self.video_folder is None:
video_dir = video_id
else:
video_dir = os.path.join(self.video_folder, video_id)
else:
videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
cap = cv2.VideoCapture(video_dir)
# Get the video dimensions
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # convert float to int
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # convert float to int
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self._aspect_ratio_buckets[closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
class AspectRatioBatchImageVideoSampler(BatchSampler):
"""A sampler wrapper for grouping images with similar aspect ratio into a same batch.
Args:
sampler (Sampler): Base sampler.
dataset (Dataset): Dataset providing data information.
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``.
aspect_ratios (dict): The predefined aspect ratios.
"""
def __init__(self,
sampler: Sampler,
dataset: Dataset,
batch_size: int,
train_folder: str = None,
aspect_ratios: dict = ASPECT_RATIO_512,
drop_last: bool = False
) -> None:
if not isinstance(sampler, Sampler):
raise TypeError('sampler should be an instance of ``Sampler``, '
f'but got {sampler}')
if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError('batch_size should be a positive integer value, '
f'but got batch_size={batch_size}')
self.sampler = sampler
self.dataset = dataset
self.train_folder = train_folder
self.batch_size = batch_size
self.aspect_ratios = aspect_ratios
self.drop_last = drop_last
# buckets for each aspect ratio
self.current_available_bucket_keys = list(aspect_ratios.keys())
self.bucket = {
'image':{ratio: [] for ratio in aspect_ratios},
'video':{ratio: [] for ratio in aspect_ratios}
}
def __iter__(self):
for idx in self.sampler:
content_type = self.dataset[idx].get('type', 'image')
if content_type == 'image':
try:
image_dict = self.dataset[idx]
width, height = image_dict.get("width", None), image_dict.get("height", None)
if width is None or height is None:
image_id, name = image_dict['file_path'], image_dict['text']
if self.train_folder is None:
image_dir = image_id
else:
image_dir = os.path.join(self.train_folder, image_id)
width, height = get_image_size_without_loading(image_dir)
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self.bucket['image'][closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
else:
try:
video_dict = self.dataset[idx]
width, height = video_dict.get("width", None), video_dict.get("height", None)
if width is None or height is None:
video_id, name = video_dict['file_path'], video_dict['text']
if self.train_folder is None:
video_dir = video_id
else:
video_dir = os.path.join(self.train_folder, video_id)
cap = cv2.VideoCapture(video_dir)
# Get the video dimensions
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # convert float to int
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # convert float to int
ratio = height / width # self.dataset[idx]
else:
height = int(height)
width = int(width)
ratio = height / width # self.dataset[idx]
except Exception as e:
print(e)
continue
# find the closest aspect ratio
closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
if closest_ratio not in self.current_available_bucket_keys:
continue
bucket = self.bucket['video'][closest_ratio]
bucket.append(idx)
# yield a batch of indices in the same aspect ratio group
if len(bucket) == self.batch_size:
yield bucket[:]
del bucket[:]
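# A minimal usage sketch (names such as `my_dataset` are illustrative, not part of this module):
#
#     sampler = RandomSampler(my_dataset)
#     batch_sampler = AspectRatioBatchImageVideoSampler(
#         sampler=sampler, dataset=my_dataset, batch_size=4, train_folder="data/train"
#     )
#     for batch_indices in batch_sampler:
#         ...  # all indices in a batch share the same aspect-ratio bucket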
from .attention import *
from .transformer2d import *
from .transformer3d import *
from .autoencoder_magvit import *
from .embeddings import *
from .motion_module import *
from .norm import *
from .patch import *
from .resampler import *
import math
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from diffusers.utils import deprecate
from diffusers.models.activations import FP32SiLU, get_activation
from diffusers.models.attention_processor import Attention
def get_timestep_embedding(
timesteps: torch.Tensor,
embedding_dim: int,
flip_sin_to_cos: bool = False,
downscale_freq_shift: float = 1,
scale: float = 1,
max_period: int = 10000,
):
"""
This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
:param timesteps: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
embeddings. :return: an [N x dim] Tensor of positional embeddings.
"""
assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(
start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
)
exponent = exponent / (half_dim - downscale_freq_shift)
emb = torch.exp(exponent)
emb = timesteps[:, None].float() * emb[None, :]
# scale embeddings
emb = scale * emb
# concat sine and cosine embeddings
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
# flip sine and cosine embeddings
if flip_sin_to_cos:
emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
# zero pad
if embedding_dim % 2 == 1:
emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
return emb
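# Example: get_timestep_embedding(torch.tensor([0., 10.]), 256) returns a (2, 256) tensor;
# the first 128 columns are sines and the last 128 are cosines (flip_sin_to_cos=True swaps the halves).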
class Timesteps(nn.Module):
def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
self.downscale_freq_shift = downscale_freq_shift
def forward(self, timesteps):
t_emb = get_timestep_embedding(
timesteps,
self.num_channels,
flip_sin_to_cos=self.flip_sin_to_cos,
downscale_freq_shift=self.downscale_freq_shift,
)
return t_emb
class TimestepEmbedding(nn.Module):
def __init__(
self,
in_channels: int,
time_embed_dim: int,
act_fn: str = "silu",
out_dim: int = None,
post_act_fn: Optional[str] = None,
cond_proj_dim=None,
sample_proj_bias=True,
):
super().__init__()
self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
if cond_proj_dim is not None:
self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
else:
self.cond_proj = None
self.act = get_activation(act_fn)
if out_dim is not None:
time_embed_dim_out = out_dim
else:
time_embed_dim_out = time_embed_dim
self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
if post_act_fn is None:
self.post_act = None
else:
self.post_act = get_activation(post_act_fn)
def forward(self, sample, condition=None):
if condition is not None:
sample = sample + self.cond_proj(condition)
sample = self.linear_1(sample)
if self.act is not None:
sample = self.act(sample)
sample = self.linear_2(sample)
if self.post_act is not None:
sample = self.post_act(sample)
return sample
class PixArtAlphaTextProjection(nn.Module):
"""
Projects caption embeddings. Also handles dropout for classifier-free guidance.
Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
"""
def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"):
super().__init__()
if out_features is None:
out_features = hidden_size
self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
if act_fn == "gelu_tanh":
self.act_1 = nn.GELU(approximate="tanh")
elif act_fn == "silu_fp32":
self.act_1 = FP32SiLU()
else:
raise ValueError(f"Unknown activation function: {act_fn}")
self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True)
def forward(self, caption):
hidden_states = self.linear_1(caption)
hidden_states = self.act_1(hidden_states)
hidden_states = self.linear_2(hidden_states)
return hidden_states
import torch
import torch.nn as nn
import torch.nn.functional as F
class HunyuanDiTAttentionPool(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads
def forward(self, x):
x = torch.cat([x.mean(dim=1, keepdim=True), x], dim=1)
x = x + self.positional_embedding[None, :, :].to(x.dtype)
query = self.q_proj(x[:, :1])
key = self.k_proj(x)
value = self.v_proj(x)
batch_size, _, _ = query.size()
query = query.reshape(batch_size, -1, self.num_heads, query.size(-1) // self.num_heads).transpose(1, 2) # (1, H, N, E/H)
key = key.reshape(batch_size, -1, self.num_heads, key.size(-1) // self.num_heads).transpose(1, 2) # (L+1, H, N, E/H)
value = value.reshape(batch_size, -1, self.num_heads, value.size(-1) // self.num_heads).transpose(1, 2) # (L+1, H, N, E/H)
x = F.scaled_dot_product_attention(query=query, key=key, value=value, attn_mask=None, dropout_p=0.0, is_causal=False)
x = x.transpose(1, 2).reshape(batch_size, 1, -1)
x = x.to(query.dtype)
x = self.c_proj(x)
return x.squeeze(1)
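# In short: the pool prepends the mean token, adds a learned positional embedding, and lets that
# single mean-token query attend over all tokens, reducing a (N, L, embed_dim) input
# (with L == spacial_dim) to a (N, output_dim or embed_dim) summary vector.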
class HunyuanCombinedTimestepTextSizeStyleEmbedding(nn.Module):
def __init__(self, embedding_dim, pooled_projection_dim=1024, seq_len=256, cross_attention_dim=2048):
super().__init__()
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
self.pooler = HunyuanDiTAttentionPool(
seq_len, cross_attention_dim, num_heads=8, output_dim=pooled_projection_dim
)
# Here we use a default learned embedder layer for future extension.
self.style_embedder = nn.Embedding(1, embedding_dim)
extra_in_dim = 256 * 6 + embedding_dim + pooled_projection_dim
self.extra_embedder = PixArtAlphaTextProjection(
in_features=extra_in_dim,
hidden_size=embedding_dim * 4,
out_features=embedding_dim,
act_fn="silu_fp32",
)
def forward(self, timestep, encoder_hidden_states, image_meta_size, style, hidden_dtype=None):
timesteps_proj = self.time_proj(timestep)
timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, 256)
# extra condition1: text
pooled_projections = self.pooler(encoder_hidden_states) # (N, 1024)
# extra condition2: image meta size embedding
image_meta_size = get_timestep_embedding(image_meta_size.view(-1), 256, True, 0)
image_meta_size = image_meta_size.to(dtype=hidden_dtype)
image_meta_size = image_meta_size.view(-1, 6 * 256) # (N, 1536)
# extra condition3: style embedding
style_embedding = self.style_embedder(style) # (N, embedding_dim)
# Concatenate all extra vectors
extra_cond = torch.cat([pooled_projections, image_meta_size, style_embedding], dim=1)
conditioning = timesteps_emb + self.extra_embedder(extra_cond) # [B, D]
return conditioning
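# Note: extra_in_dim = 256 * 6 + embedding_dim + pooled_projection_dim matches the concatenation above:
# the pooled text projection (pooled_projection_dim), the image meta size embedding (6 values x 256 dims each)
# and the style embedding (embedding_dim).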
from typing import Any, Dict, Optional, Tuple
import torch
import torch.nn.functional as F
from diffusers.models.embeddings import TimestepEmbedding, Timesteps
from torch import nn
def zero_module(module):
# Zero out the parameters of a module and return it.
for p in module.parameters():
p.detach().zero_()
return module
class FP32LayerNorm(nn.LayerNorm):
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
origin_dtype = inputs.dtype
if hasattr(self, 'weight') and self.weight is not None:
return F.layer_norm(
inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps
).to(origin_dtype)
else:
return F.layer_norm(
inputs.float(), self.normalized_shape, None, None, self.eps
).to(origin_dtype)
class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
"""
For PixArt-Alpha.
Reference:
https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
"""
def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
super().__init__()
self.outdim = size_emb_dim
self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
self.use_additional_conditions = use_additional_conditions
if use_additional_conditions:
self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
self.resolution_embedder.linear_2 = zero_module(self.resolution_embedder.linear_2)
self.aspect_ratio_embedder.linear_2 = zero_module(self.aspect_ratio_embedder.linear_2)
def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
timesteps_proj = self.time_proj(timestep)
timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
if self.use_additional_conditions:
resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
else:
conditioning = timesteps_emb
return conditioning
class AdaLayerNormSingle(nn.Module):
r"""
Norm layer adaptive layer norm single (adaLN-single).
As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
Parameters:
embedding_dim (`int`): The size of each embedding vector.
use_additional_conditions (`bool`): To use additional conditions for normalization or not.
"""
def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
super().__init__()
self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
)
self.silu = nn.SiLU()
self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
def forward(
self,
timestep: torch.Tensor,
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
batch_size: Optional[int] = None,
hidden_dtype: Optional[torch.dtype] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
# No modulation happening here.
embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
return self.linear(self.silu(embedded_timestep)), embedded_timestep
class AdaLayerNormShift(nn.Module):
r"""
Norm layer modified to incorporate timestep embeddings.
Parameters:
embedding_dim (`int`): The size of each embedding vector.
"""
def __init__(self, embedding_dim: int, elementwise_affine=True, eps=1e-6):
super().__init__()
self.silu = nn.SiLU()
self.linear = nn.Linear(embedding_dim, embedding_dim)
self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=elementwise_affine, eps=eps)
def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
shift = self.linear(self.silu(emb.to(torch.float32)).to(emb.dtype))
x = self.norm(x) + shift.unsqueeze(dim=1)
return x
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import normal_
def get_abs_pos(abs_pos, tgt_size):
# abs_pos: L, C
# tgt_size: M
# return: M, C
src_size = int(math.sqrt(abs_pos.size(0)))
tgt_size = int(math.sqrt(tgt_size))
dtype = abs_pos.dtype
if src_size != tgt_size:
return F.interpolate(
abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
size=(tgt_size, tgt_size),
mode="bicubic",
align_corners=False,
).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
else:
return abs_pos
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float32)
omega /= embed_dim / 2.
omega = 1. / 10000**omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
class Resampler(nn.Module):
"""
A 2D perceiver-resampler network with one cross-attention layer, using
(grid_size**2) learnable queries and 2D sincos pos_emb.
Outputs:
A tensor with the shape of (grid_size**2, embed_dim)
"""
def __init__(
self,
grid_size,
embed_dim,
num_heads,
kv_dim=None,
norm_layer=nn.LayerNorm
):
super().__init__()
self.num_queries = grid_size ** 2
self.embed_dim = embed_dim
self.num_heads = num_heads
self.pos_embed = nn.Parameter(
torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
).requires_grad_(False)
self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
normal_(self.query, std=.02)
if kv_dim is not None and kv_dim != embed_dim:
self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
else:
self.kv_proj = nn.Identity()
self.attn = nn.MultiheadAttention(embed_dim, num_heads)
self.ln_q = norm_layer(embed_dim)
self.ln_kv = norm_layer(embed_dim)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward(self, x, key_padding_mask=None):
pos_embed = get_abs_pos(self.pos_embed, x.size(1))
x = self.kv_proj(x)
x = self.ln_kv(x).permute(1, 0, 2)
N = x.shape[1]
q = self.ln_q(self.query)
out = self.attn(
self._repeat(q, N) + self.pos_embed.unsqueeze(1),
x + pos_embed.unsqueeze(1),
x,
key_padding_mask=key_padding_mask)[0]
return out.permute(1, 0, 2)
def _repeat(self, query, N: int):
return query.unsqueeze(1).repeat(1, N, 1)
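# In short: Resampler maps a (batch, seq_len, kv_dim) feature sequence onto grid_size**2 learned queries
# via a single cross-attention, returning a (batch, grid_size**2, embed_dim) tensor.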