# Prediction interface for Cog ⚙️
# https://cog.run/python

import argparse
import os
import subprocess
import time

import imageio
import numpy as np
import torch
import torchvision
from cog import BasePredictor, Input, Path
from einops import rearrange
from fastvideo.models.hunyuan.inference import HunyuanVideoSampler

MODEL_CACHE = 'FastHunyuan'
os.environ['MODEL_BASE'] = './' + MODEL_CACHE
MODEL_URL = "https://weights.replicate.delivery/default/FastVideo/FastHunyuan/model.tar"


def download_weights(url, dest):
    """Download and extract the model weights tarball with pget."""
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory"""
        print("Model Base: " + os.environ['MODEL_BASE'])

        # Download weights on first run if they are not cached locally
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Fixed model configuration. FastHunyuan is a distilled HunyuanVideo
        # variant (model='HYVideo-T/2-cfgdistill'), hence the low default step
        # count with guidance_scale=1.0 and embedded CFG.
        args = argparse.Namespace(
            num_frames=125,
            height=720,
            width=1280,
            num_inference_steps=6,
            fps=24,
            denoise_type='flow',
            seed=1024,
            neg_prompt=None,
            guidance_scale=1.0,
            embedded_cfg_scale=6.0,
            flow_shift=17,
            batch_size=1,
            num_videos=1,
            load_key='module',
            use_cpu_offload=False,
            dit_weight='FastHunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt',
            reproduce=True,
            disable_autocast=False,
            flow_reverse=True,
            flow_solver='euler',
            use_linear_quadratic_schedule=False,
            linear_schedule_end=25,
            model='HYVideo-T/2-cfgdistill',
            latent_channels=16,
            precision='bf16',
            rope_theta=256,
            vae='884-16c-hy',
            vae_precision='fp16',
            vae_tiling=True,
            text_encoder='llm',
            text_encoder_precision='fp16',
            text_states_dim=4096,
            text_len=256,
            tokenizer='llm',
            prompt_template='dit-llm-encode',
            prompt_template_video='dit-llm-encode-video',
            hidden_state_skip_layer=2,
            apply_final_norm=False,
            text_encoder_2='clipL',
            text_encoder_precision_2='fp16',
            text_states_dim_2=768,
            tokenizer_2='clipL',
            text_len_2=77,
            model_path=MODEL_CACHE,
        )
        self.model = HunyuanVideoSampler.from_pretrained(MODEL_CACHE, args=args)

    def predict(
        self,
        prompt: str = Input(
            description="Text prompt for video generation",
            default="A cat walks on the grass, realistic style.",
        ),
        negative_prompt: str = Input(
            description="Text prompt to specify what you don't want in the video.",
            default="",
        ),
        width: int = Input(description="Width of output video", default=1280, ge=256),
        height: int = Input(description="Height of output video", default=720, ge=256),
        num_frames: int = Input(description="Number of frames to generate", default=125, ge=16),
        num_inference_steps: int = Input(description="Number of denoising steps", default=6, ge=1, le=50),
        guidance_scale: float = Input(description="Classifier-free guidance scale", default=1.0, ge=0.1, le=10.0),
        embedded_cfg_scale: float = Input(description="Embedded classifier-free guidance scale", default=6.0, ge=0.1, le=10.0),
        flow_shift: int = Input(description="Flow shift parameter", default=17, ge=1, le=20),
        fps: int = Input(description="Frames per second of output video", default=24, ge=1, le=60),
        seed: int = Input(description="Random seed. Leave at 0 for a random seed; set a positive value for reproducible generation.", default=0),
    ) -> Path:
        """Run video generation"""
        if seed <= 0:
            seed = int.from_bytes(os.urandom(2), "big")
        print(f"Using seed: {seed}")

        outputs = self.model.predict(
            prompt=prompt,
            height=height,
            width=width,
            video_length=num_frames,
            seed=seed,
            negative_prompt=negative_prompt,
            infer_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            embedded_guidance_scale=embedded_cfg_scale,
            flow_shift=flow_shift,
            flow_reverse=True,
            batch_size=1,
            num_videos_per_prompt=1,
        )

        # Convert the sampled (batch, channel, time, height, width) tensor
        # into a list of HWC uint8 frames
        videos = rearrange(outputs["samples"], "b c t h w -> t b c h w")
        frames = []
        for x in videos:
            x = torchvision.utils.make_grid(x, nrow=6)
            x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)  # CHW -> HWC
            # .cpu() before .numpy(); it is a no-op if the tensor is already on CPU
            frames.append((x * 255).cpu().numpy().astype(np.uint8))

        # Encode frames to MP4
        output_path = Path("/tmp/output.mp4")
        imageio.mimsave(str(output_path), frames, fps=fps)
        return output_path
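
# ---------------------------------------------------------------------------
# Optional local smoke test: a minimal sketch, not part of the Cog interface.
# It assumes a CUDA machine; setup() downloads the weights on first run, and
# `cog predict` never executes this block. Every argument is passed explicitly
# because the Input(...) defaults above are only resolved when Cog itself
# invokes the model.
if __name__ == "__main__":
    predictor = Predictor()
    predictor.setup()
    video_path = predictor.predict(
        prompt="A cat walks on the grass, realistic style.",
        negative_prompt="",
        width=1280,
        height=720,
        num_frames=125,
        num_inference_steps=6,
        guidance_scale=1.0,
        embedded_cfg_scale=6.0,
        flow_shift=17,
        fps=24,
        seed=1024,
    )
    print(f"Video saved to: {video_path}")

# Typical usage through the Cog CLI instead (assumes this file is registered
# as the predictor in cog.yaml):
#   cog predict -i prompt="A cat walks on the grass, realistic style."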