Commit 0513d03d authored by jerrrrry

Initial commit

Pipeline #3321 canceled with stages
import os
import torch
import diffusers
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
from packaging import version
from xfuser.logger import init_logger
logger = init_logger(__name__)
if TYPE_CHECKING:
MASTER_ADDR: str = ""
MASTER_PORT: Optional[int] = None
CUDA_HOME: Optional[str] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
XDIT_LOGGING_LEVEL: str = "INFO"
CUDA_VERSION: version.Version
TORCH_VERSION: version.Version
environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ==================
# used in distributed environment to determine the master address
"MASTER_ADDR": lambda: os.getenv("MASTER_ADDR", ""),
# used in distributed environment to manually set the communication port
"MASTER_PORT": lambda: (
int(os.getenv("MASTER_PORT", "0")) if "MASTER_PORT" in os.environ else None
),
# path to cudatoolkit home directory, under which should be bin, include,
# and lib directories.
"CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
# local rank of the process in the distributed setting, used to determine
# the GPU device id
"LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
# used to control the visible devices in the distributed setting
"CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
# this is used for configuring the default logging level
"XDIT_LOGGING_LEVEL": lambda: os.getenv("XDIT_LOGGING_LEVEL", "INFO"),
}
variables: Dict[str, Callable[[], Any]] = {
# ================== Other Vars ==================
# used in version checking
# "CUDA_VERSION": lambda: version.parse(torch.version.cuda),
"CUDA_VERSION": "gfx936",
"TORCH_VERSION": lambda: version.parse(
version.parse(torch.__version__).base_version
),
}
class PackagesEnvChecker:
_instance = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(PackagesEnvChecker, cls).__new__(cls)
cls._instance.initialize()
return cls._instance
def initialize(self):
self.packages_info = {
"has_flash_attn": self.check_flash_attn(),
"has_long_ctx_attn": self.check_long_ctx_attn(),
"diffusers_version": self.check_diffusers_version(),
}
def check_flash_attn(self):
try:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_name = torch.cuda.get_device_name(device)
if "Turing" in gpu_name or "Tesla" in gpu_name or "T4" in gpu_name:
return False
else:
from flash_attn import flash_attn_func
from flash_attn import __version__
if __version__ < "2.6.0":
raise ImportError(f"install flash_attn >= 2.6.0")
return True
except ImportError:
logger.warning(
f'Flash Attention library "flash_attn" not found, '
f"using pytorch attention implementation"
)
return False
def check_long_ctx_attn(self):
try:
from yunchang import (
set_seq_parallel_pg,
ring_flash_attn_func,
UlyssesAttention,
LongContextAttention,
LongContextAttentionQKVPacked,
)
return True
except ImportError:
logger.warning(
f'Ring Flash Attention library "yunchang" not found, '
f"using pytorch attention implementation"
)
return False
def check_diffusers_version(self):
if version.parse(
version.parse(diffusers.__version__).base_version
) < version.parse("0.30.0"):
raise RuntimeError(
f"Diffusers version: {version.parse(version.parse(diffusers.__version__).base_version)} is not supported,"
f"please upgrade to version > 0.30.0"
)
return version.parse(version.parse(diffusers.__version__).base_version)
def get_packages_info(self):
return self.packages_info
PACKAGES_CHECKER = PackagesEnvChecker()
def __getattr__(name):
# lazy evaluation of environment variables
if name in environment_variables:
return environment_variables[name]()
if name in variables:
return variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def __dir__():
return list(environment_variables.keys()) + list(variables.keys())
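# --- Hypothetical usage sketch, not part of the module above. The module name
# `envs` is an assumption; the module-level __getattr__ defined above evaluates
# each registered entry lazily on attribute access. ---
import envs

print(envs.LOCAL_RANK)          # int(os.environ.get("LOCAL_RANK", "0"))
print(envs.XDIT_LOGGING_LEVEL)  # "INFO" unless overridden in the environment
print(envs.CUDA_VERSION)        # "gfx936", the hard-coded ROCm arch above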
opencv-python==4.9.0.80
diffusers==0.31.0
transformers==4.46.3
tokenizers==0.20.3
accelerate==1.1.1
pandas==2.0.3
numpy==1.24.4
einops==0.7.0
tqdm==4.66.2
loguru==0.7.2
imageio==2.34.0
imageio-ffmpeg==0.5.1
safetensors==0.4.3
gradio==5.0.0
#!/bin/bash
# export GPU_FLUSH_ON_EXECUTION=1
len=129
step=20
mkdir -p video-logs
for num in 2 4; do
torchrun --nproc_per_node=${num} sample_video.py \
--video-size 1280 720 \
--video-length ${len} \
--infer-steps ${step} \
--prompt "A cat walks on the grass, realistic style." \
--flow-reverse \
--seed 42 \
--ulysses-degree ${num} \
--ring-degree 1 \
--save-path ./results 2>&1 | tee video-logs/bw-video-len_${len}-step_${step}-num-${num}.log
done
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# warmup
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=2,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
# Start sampling
# TODO: batch inference check
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from torch.profiler import profile, ProfilerActivity
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# warmup
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=2,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
# Start sampling
# TODO: batch inference check
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=False,with_stack=False,with_modules=False,profile_memory=False) as p:
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=50)
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
print(output)
p.export_chrome_trace("prof/bak-video-op_rope_mask-len_{}-step_{}-num-2.json".format(args.video_length,args.infer_steps))
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
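# --- Standalone sketch, not part of the script above, of the torch.profiler
# pattern it uses, reduced to a toy matmul workload so it runs without the
# HunyuanVideo models. The output path "prof/toy_trace.json" is illustrative. ---
import os
import torch
from torch.profiler import profile, ProfilerActivity

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 1024, device=device)

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=False, profile_memory=False) as p:
    for _ in range(10):
        x = x @ x  # stand-in for hunyuan_video_sampler.predict(...)

# Same reporting calls as the script above: aggregate table plus a Chrome trace.
sort_key = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
print(p.key_averages().table(sort_by=sort_key, row_limit=10))
os.makedirs("prof", exist_ok=True)
p.export_chrome_trace("prof/toy_trace.json")  # view in chrome://tracing or Perfetto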
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
python3 sample_video.py \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--use-cpu-offload \
--save-path ./results
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
DIT_CKPT_PATH={PATH_TO}/{MODEL_NAME}_model_states_fp8.pt
python3 sample_video.py \
--dit-weight ${DIT_CKPT_PATH} \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--use-cpu-offload \
--use-fp8 \
--save-path ./results
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
# Supported Parallel Configurations
# | --video-size | --video-length | --ulysses-degree x --ring-degree | --nproc_per_node |
# |----------------------|----------------|----------------------------------|------------------|
# | 1280 720 or 720 1280 | 129 | 8x1,4x2,2x4,1x8 | 8 |
# | 1280 720 or 720 1280 | 129 | 1x5 | 5 |
# | 1280 720 or 720 1280 | 129 | 4x1,2x2,1x4 | 4 |
# | 1280 720 or 720 1280 | 129 | 3x1,1x3 | 3 |
# | 1280 720 or 720 1280 | 129 | 2x1,1x2 | 2 |
# | 1104 832 or 832 1104 | 129 | 4x1,2x2,1x4 | 4 |
# | 1104 832 or 832 1104 | 129 | 3x1,1x3 | 3 |
# | 1104 832 or 832 1104 | 129 | 2x1,1x2 | 2 |
# | 960 960 | 129 | 6x1,3x2,2x3,1x6 | 6 |
# | 960 960 | 129 | 4x1,2x2,1x4 | 4 |
# | 960 960 | 129 | 3x1,1x3 | 3 |
# | 960 960 | 129 | 1x2,2x1 | 2 |
# | 960 544 or 544 960 | 129 | 6x1,3x2,2x3,1x6 | 6 |
# | 960 544 or 544 960 | 129 | 4x1,2x2,1x4 | 4 |
# | 960 544 or 544 960 | 129 | 3x1,1x3 | 3 |
# | 960 544 or 544 960 | 129 | 1x2,2x1 | 2 |
# | 832 624 or 624 832 | 129 | 4x1,2x2,1x4 | 4 |
# | 832 624 or 624 832 | 129 | 3x1,1x3 | 3 |
# | 832 624 or 624 832 | 129 | 2x1,1x2 | 2 |
# | 720 720 | 129 | 1x5 | 5 |
# | 720 720 | 129 | 3x1,1x3 | 3 |
export TOKENIZERS_PARALLELISM=false
export NPROC_PER_NODE=8
export ULYSSES_DEGREE=8
export RING_DEGREE=1
torchrun --nproc_per_node=$NPROC_PER_NODE sample_video.py \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--ulysses-degree=$ULYSSES_DEGREE \
--ring-degree=$RING_DEGREE \
--save-path ./results
import torch
import sys
import os
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)
from hyvideo.modules.attenion import attention
from xfuser.core.long_ctx_attention import xFuserLongContextAttention
from xfuser.core.distributed import (
init_distributed_environment,
initialize_model_parallel,
# initialize_runtime_state,
)
def init_dist(backend="nccl"):
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
print(
f"Initializing distributed environment with rank {rank}, world size {world_size}, local rank {local_rank}"
)
torch.cuda.set_device(local_rank)
init_distributed_environment(rank=rank, world_size=world_size)
# dist.init_process_group(backend=backend)
# construct a hybrid sequence parallel config (ulysses=2, ring = world_size // 2)
if world_size > 1:
ring_degree = world_size // 2
ulysses_degree = 2
else:
ring_degree = 1
ulysses_degree = 1
initialize_model_parallel(
sequence_parallel_degree=world_size,
ring_degree=ring_degree,
ulysses_degree=ulysses_degree,
)
return rank, world_size
def test_mm_double_stream_block_attention(rank, world_size):
device = torch.device(f"cuda:{rank}")
dtype = torch.bfloat16
batch_size = 1
seq_len_img = 118800
seq_len_txt = 256
heads_num = 24
head_dim = 128
img_q = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_k = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_v = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
txt_q = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_k = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_v = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
with torch.no_grad():
torch.distributed.broadcast(img_q, src=0)
torch.distributed.broadcast(img_k, src=0)
torch.distributed.broadcast(img_v, src=0)
torch.distributed.broadcast(txt_q, src=0)
torch.distributed.broadcast(txt_k, src=0)
torch.distributed.broadcast(txt_v, src=0)
q = torch.cat((img_q, txt_q), dim=1)
k = torch.cat((img_k, txt_k), dim=1)
v = torch.cat((img_v, txt_v), dim=1)
cu_seqlens_q = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
cu_seqlens_kv = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
max_seqlen_q = 119056
max_seqlen_kv = 119056
mode = "torch" # "torch", "vanilla", "flash"
original_output = attention(
q,
k,
v,
mode=mode,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
max_seqlen_q=max_seqlen_q,
max_seqlen_kv=max_seqlen_kv,
batch_size=batch_size
)
hybrid_seq_parallel_attn = xFuserLongContextAttention()
hybrid_seq_parallel_output = hybrid_seq_parallel_attn(
None,
img_q,
img_k,
img_v,
dropout_p=0.0,
causal=False,
joint_tensor_query=txt_q,
joint_tensor_key=txt_k,
joint_tensor_value=txt_v,
joint_strategy="rear",
)
b, s, a, d = hybrid_seq_parallel_output.shape
hybrid_seq_parallel_output = hybrid_seq_parallel_output.reshape(b, s, -1)
assert original_output.shape == hybrid_seq_parallel_output.shape, f"Shape mismatch: {original_output.shape} vs {hybrid_seq_parallel_output.shape}"
torch.testing.assert_close(original_output, hybrid_seq_parallel_output, rtol=1e-3, atol=1e-3)
print("test_mm_double_stream_block_attention Passed")
def test_mm_single_stream_block_attention(rank, world_size):
device = torch.device(f"cuda:{rank}")
dtype = torch.bfloat16
txt_len = 256
batch_size = 1
seq_len_img = 118800
seq_len_txt = 256
heads_num = 24
head_dim = 128
with torch.no_grad():
img_q = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_k = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
txt_q = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_k = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
v = torch.randn(batch_size, seq_len_img + seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
torch.distributed.broadcast(img_q, src=0)
torch.distributed.broadcast(img_k, src=0)
torch.distributed.broadcast(txt_q, src=0)
torch.distributed.broadcast(txt_k, src=0)
torch.distributed.broadcast(v, src=0)
q = torch.cat((img_q, txt_q), dim=1)
k = torch.cat((img_k, txt_k), dim=1)
cu_seqlens_q = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
cu_seqlens_kv = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
max_seqlen_q = 119056
max_seqlen_kv = 119056
mode = "torch" # "torch", "vanilla", "flash"
original_output = attention(
q,
k,
v,
mode=mode,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
max_seqlen_q=max_seqlen_q,
max_seqlen_kv=max_seqlen_kv,
batch_size=batch_size
)
hybrid_seq_parallel_attn = xFuserLongContextAttention()
hybrid_seq_parallel_output = hybrid_seq_parallel_attn(
None,
q[:, :-txt_len, :, :],
k[:, :-txt_len, :, :],
v[:, :-txt_len, :, :],
dropout_p=0.0,
causal=False,
joint_tensor_query=q[:, -txt_len:, :, :],
joint_tensor_key=k[:, -txt_len:, :, :],
joint_tensor_value=v[:, -txt_len:, :, :],
joint_strategy="rear",
)
b, s, a, d = hybrid_seq_parallel_output.shape
hybrid_seq_parallel_output = hybrid_seq_parallel_output.reshape(b, s, -1)
assert original_output.shape == hybrid_seq_parallel_output.shape, f"Shape mismatch: {original_output.shape} vs {hybrid_seq_parallel_output.shape}"
torch.testing.assert_close(original_output, hybrid_seq_parallel_output, rtol=1e-3, atol=1e-3)
print("test_mm_single_stream_block_attention Passed")
if __name__ == "__main__":
rank, world_size = init_dist()
test_mm_double_stream_block_attention(rank, world_size)
test_mm_single_stream_block_attention(rank, world_size)
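# --- Hypothetical helper, not part of the test above: it restates the constraint
# that init_dist() relies on, ulysses_degree * ring_degree == world_size.
# Typical launch (file name assumed): torchrun --nproc_per_node=2 <this_test_file>.py ---
def choose_parallel_degrees(world_size: int, ulysses_degree: int = 2):
    """Return (ulysses_degree, ring_degree) whose product equals world_size."""
    if world_size <= 1:
        return 1, 1
    if world_size % ulysses_degree != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by ulysses_degree={ulysses_degree}"
        )
    return ulysses_degree, world_size // ulysses_degree

# Example: 8 ranks with ulysses_degree=2 gives ring_degree=4, matching world_size // 2 above.
assert choose_parallel_degrees(8) == (2, 4)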
# Copyright (c) OpenMMLab. All rights reserved.
"""This file holding some environment constant for sharing by other files."""
import os
import os.path as osp
import subprocess
import sys
from collections import OrderedDict, defaultdict
import numpy as np
import torch
def is_rocm_pytorch() -> bool:
"""Check whether the PyTorch is compiled on ROCm."""
is_rocm = False
if TORCH_VERSION != 'parrots':
try:
from torch.utils.cpp_extension import ROCM_HOME
is_rocm = True if ((torch.version.hip is not None) and
(ROCM_HOME is not None)) else False
except ImportError:
pass
return is_rocm
TORCH_VERSION = torch.__version__
def get_build_config():
"""Obtain the build information of PyTorch or Parrots."""
if TORCH_VERSION == 'parrots':
from parrots.config import get_build_info
return get_build_info()
else:
return torch.__config__.show()
try:
import torch_musa # noqa: F401
IS_MUSA_AVAILABLE = True
except Exception:
IS_MUSA_AVAILABLE = False
def is_musa_available() -> bool:
return IS_MUSA_AVAILABLE
def is_cuda_available() -> bool:
"""Returns True if cuda devices exist."""
return torch.cuda.is_available()
def _get_cuda_home():
if TORCH_VERSION == 'parrots':
from parrots.utils.build_extension import CUDA_HOME
else:
if is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
CUDA_HOME = ROCM_HOME
else:
from torch.utils.cpp_extension import CUDA_HOME
return CUDA_HOME
def _get_musa_home():
return os.environ.get('MUSA_HOME')
def collect_env():
"""Collect the information of the running environments.
Returns:
dict: The environment information. The following fields are contained.
- sys.platform: The variable of ``sys.platform``.
- Python: Python version.
- CUDA available: Bool, indicating if CUDA is available.
- GPU devices: Device type of each GPU.
- CUDA_HOME (optional): The env var ``CUDA_HOME``.
- NVCC (optional): NVCC version.
- GCC: GCC version, "n/a" if GCC is not installed.
- MSVC: Microsoft Virtual C++ Compiler version, Windows only.
- PyTorch: PyTorch version.
- PyTorch compiling details: The output of \
``torch.__config__.show()``.
- TorchVision (optional): TorchVision version.
- OpenCV (optional): OpenCV version.
"""
from distutils import errors
env_info = OrderedDict()
env_info['sys.platform'] = sys.platform
env_info['Python'] = sys.version.replace('\n', '')
cuda_available = is_cuda_available()
musa_available = is_musa_available()
env_info['CUDA available'] = cuda_available
env_info['MUSA available'] = musa_available
env_info['numpy_random_seed'] = np.random.get_state()[1][0]
if cuda_available:
devices = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
for name, device_ids in devices.items():
env_info['GPU ' + ','.join(device_ids)] = name
CUDA_HOME = _get_cuda_home()
env_info['CUDA_HOME'] = CUDA_HOME
if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
if CUDA_HOME == '/opt/rocm':
try:
nvcc = osp.join(CUDA_HOME, 'hip/bin/hipcc')
nvcc = subprocess.check_output(
f'"{nvcc}" --version', shell=True)
nvcc = nvcc.decode('utf-8').strip()
release = nvcc.rfind('HIP version:')
build = nvcc.rfind('')
nvcc = nvcc[release:build].strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
else:
try:
nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
nvcc = subprocess.check_output(f'"{nvcc}" -V', shell=True)
nvcc = nvcc.decode('utf-8').strip()
release = nvcc.rfind('Cuda compilation tools')
build = nvcc.rfind('Build ')
nvcc = nvcc[release:build].strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
env_info['NVCC'] = nvcc
elif musa_available:
devices = defaultdict(list)
for k in range(torch.musa.device_count()):
devices[torch.musa.get_device_name(k)].append(str(k))
for name, device_ids in devices.items():
env_info['GPU ' + ','.join(device_ids)] = name
MUSA_HOME = _get_musa_home()
env_info['MUSA_HOME'] = MUSA_HOME
if MUSA_HOME is not None and osp.isdir(MUSA_HOME):
try:
mcc = osp.join(MUSA_HOME, 'bin/mcc')
subprocess.check_output(f'"{mcc}" -v', shell=True)
except subprocess.SubprocessError:
mcc = 'Not Available'
env_info['mcc'] = mcc
try:
# Check C++ Compiler.
# For Unix-like, sysconfig has 'CC' variable like 'gcc -pthread ...',
# indicating the compiler used, we use this to get the compiler name
import io
import sysconfig
cc = sysconfig.get_config_var('CC')
if cc:
cc = osp.basename(cc.split()[0])
cc_info = subprocess.check_output(f'{cc} --version', shell=True)
env_info['GCC'] = cc_info.decode('utf-8').partition(
'\n')[0].strip()
else:
# on Windows, cl.exe is not in PATH. We need to find the path.
# distutils.ccompiler.new_compiler() returns a msvccompiler
# object and after initialization, path to cl.exe is found.
import locale
import os
from distutils.ccompiler import new_compiler
ccompiler = new_compiler()
ccompiler.initialize()
cc = subprocess.check_output(
f'{ccompiler.cc}', stderr=subprocess.STDOUT, shell=True)
encoding = os.device_encoding(
sys.stdout.fileno()) or locale.getpreferredencoding()
env_info['MSVC'] = cc.decode(encoding).partition('\n')[0].strip()
env_info['GCC'] = 'n/a'
except (subprocess.CalledProcessError, errors.DistutilsPlatformError):
env_info['GCC'] = 'n/a'
except io.UnsupportedOperation as e:
# JupyterLab on Windows changes sys.stdout, which has no `fileno` attr
# Refer to: https://github.com/open-mmlab/mmengine/issues/931
# TODO: find a solution to get compiler info in Windows JupyterLab,
# while preserving backward-compatibility in other systems.
env_info['MSVC'] = f'n/a, reason: {str(e)}'
env_info['PyTorch'] = torch.__version__
env_info['PyTorch compiling details'] = get_build_config()
try:
import torchvision
env_info['TorchVision'] = torchvision.__version__
except ModuleNotFoundError:
pass
try:
import cv2
env_info['OpenCV'] = cv2.__version__
except ImportError:
pass
return env_info
if __name__ == '__main__':
for name, val in collect_env().items():
print(f'{name}: {val}')
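# --- Minimal usage sketch, assuming the file above is saved as collect_env.py and
# is importable from the working directory; the field names follow the collect_env() docstring. ---
from collect_env import collect_env

info = collect_env()
for key in ("sys.platform", "Python", "CUDA available", "CUDA_HOME", "NVCC", "PyTorch"):
    print(f"{key}: {info.get(key, 'n/a')}")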
Namespace(model='HYVideo-T/2-cfgdistill', latent_channels=16, precision='bf16', rope_theta=256, vae='884-16c-hy', vae_precision='fp16', vae_tiling=True, text_encoder='llm', text_encoder_precision='fp16', text_states_dim=4096, text_len=256, tokenizer='llm', prompt_template='dit-llm-encode', prompt_template_video='dit-llm-encode-video', hidden_state_skip_layer=2, apply_final_norm=False, text_encoder_2='clipL', text_encoder_precision_2='fp16', text_states_dim_2=768, tokenizer_2='clipL', text_len_2=77, denoise_type='flow', flow_shift=7.0, flow_reverse=True, flow_solver='euler', use_linear_quadratic_schedule=False, linear_schedule_end=25, model_base='ckpts', dit_weight='ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt', model_resolution='540p', load_key='module', use_cpu_offload=False, batch_size=1, infer_steps=20, disable_autocast=False, save_path='./results', save_path_suffix='', name_suffix='', num_videos=1, video_size=[1280, 720], video_length=33, prompt='A cat walks on the grass, realistic style.', seed_type='auto', seed=42, neg_prompt=None, cfg_scale=1.0, embedded_cfg_scale=6.0, use_fp8=False, reproduce=False, ulysses_degree=1, ring_degree=1)
2026-02-02 14:09:46.064 | INFO | hyvideo.inference:from_pretrained:154 - Got text-to-video model root path: ckpts
2026-02-02 14:09:46.065 | INFO | hyvideo.inference:from_pretrained:189 - Building model...
2026-02-02 14:09:46.741 | INFO | hyvideo.inference:load_state_dict:340 - Loading torch model ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt...
/workspace/cicd/HunyuanVideo-t2v/hyvideo/inference.py:341: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
2026-02-02 14:10:02.963 | INFO | hyvideo.vae:load_vae:29 - Loading 3D VAE model (884-16c-hy) from: ./ckpts/hunyuan-video-t2v-720p/vae
/workspace/cicd/HunyuanVideo-t2v/hyvideo/vae/__init__.py:39: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
ckpt = torch.load(vae_ckpt, map_location=vae.device)
2026-02-02 14:10:05.461 | INFO | hyvideo.vae:load_vae:55 - VAE to dtype: torch.float16
2026-02-02 14:10:05.633 | INFO | hyvideo.text_encoder:load_text_encoder:28 - Loading text encoder model (llm) from: ./ckpts/text_encoder
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] Loading checkpoint shards: 25%|██▌ | 1/4 [00:02<00:07, 2.49s/it] Loading checkpoint shards: 50%|█████ | 2/4 [00:05<00:05, 2.71s/it] Loading checkpoint shards: 75%|███████▌ | 3/4 [00:08<00:02, 2.77s/it] Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 1.79s/it] Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.12s/it]
2026-02-02 14:10:19.819 | INFO | hyvideo.text_encoder:load_text_encoder:50 - Text encoder to dtype: torch.float16
2026-02-02 14:10:23.769 | INFO | hyvideo.text_encoder:load_tokenizer:64 - Loading tokenizer (llm) from: ./ckpts/text_encoder
2026-02-02 14:10:24.283 | INFO | hyvideo.text_encoder:load_text_encoder:28 - Loading text encoder model (clipL) from: ./ckpts/text_encoder_2
2026-02-02 14:10:24.447 | INFO | hyvideo.text_encoder:load_text_encoder:50 - Text encoder to dtype: torch.float16
2026-02-02 14:10:24.500 | INFO | hyvideo.text_encoder:load_tokenizer:64 - Loading tokenizer (clipL) from: ./ckpts/text_encoder_2
2026-02-02 14:10:24.595 | INFO | hyvideo.inference:predict:580 - Input (height, width, video_length) = (1280, 720, 33)
2026-02-02 14:10:24.617 | DEBUG | hyvideo.inference:predict:642 -
height: 1280
width: 720
video_length: 33
prompt: ['A cat walks on the grass, realistic style.']
neg_prompt: ['']
seed: 42
infer_steps: 2
num_videos_per_prompt: 1
guidance_scale: 1.0
n_tokens: 32400
flow_shift: 7.0
embedded_guidance_scale: 6.0
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:602: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:663.)
attn_output = torch.nn.functional.scaled_dot_product_attention(
0%| | 0/2 [00:00<?, ?it/s] 50%|█████ | 1/2 [00:12<00:12, 12.57s/it] 100%|██████████| 2/2 [00:20<00:00, 9.91s/it] 100%|██████████| 2/2 [00:20<00:00, 10.30s/it]
2026-02-02 14:11:05.154 | INFO | hyvideo.inference:predict:671 - Success, time: 40.5368127822876
2026-02-02 14:11:05.154 | INFO | hyvideo.inference:predict:580 - Input (height, width, video_length) = (1280, 720, 33)
2026-02-02 14:11:05.180 | DEBUG | hyvideo.inference:predict:642 -
height: 1280
width: 720
video_length: 33
prompt: ['A cat walks on the grass, realistic style.']
neg_prompt: ['']
seed: 42
infer_steps: 20
num_videos_per_prompt: 1
guidance_scale: 1.0
n_tokens: 32400
flow_shift: 7.0
embedded_guidance_scale: 6.0
0%| | 0/20 [00:00<?, ?it/s] 5%|▌ | 1/20 [00:08<02:35, 8.17s/it] 10%|█ | 2/20 [00:16<02:25, 8.10s/it] 15%|█▌ | 3/20 [00:24<02:18, 8.13s/it] 20%|██ | 4/20 [00:32<02:10, 8.14s/it] 25%|██▌ | 5/20 [00:40<02:02, 8.14s/it] 30%|███ | 6/20 [00:48<01:54, 8.15s/it] 35%|███▌ | 7/20 [00:57<01:45, 8.15s/it] 40%|████ | 8/20 [01:05<01:37, 8.16s/it] 45%|████▌ | 9/20 [01:13<01:29, 8.16s/it] 50%|█████ | 10/20 [01:21<01:21, 8.16s/it] 55%|█████▌ | 11/20 [01:29<01:13, 8.16s/it] 60%|██████ | 12/20 [01:37<01:05, 8.16s/it] 65%|██████▌ | 13/20 [01:46<00:57, 8.17s/it] 70%|███████ | 14/20 [01:54<00:49, 8.17s/it] 75%|███████▌ | 15/20 [02:02<00:40, 8.17s/it] 80%|████████ | 16/20 [02:10<00:32, 8.17s/it] 85%|████████▌ | 17/20 [02:18<00:24, 8.16s/it] 90%|█████████ | 18/20 [02:26<00:16, 8.16s/it] 95%|█████████▌| 19/20 [02:35<00:08, 8.17s/it] 100%|██████████| 20/20 [02:43<00:00, 8.16s/it] 100%|██████████| 20/20 [02:43<00:00, 8.16s/it]
2026-02-02 14:14:02.787 | INFO | hyvideo.inference:predict:671 - Success, time: 177.60623216629028
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2026-02-02 14:14:03.989 | INFO | __main__:main:72 - Sample save to: ./results/2026-02-02-14:14:02_seed42_A cat walks on the grass, realistic style..mp4