Commit 0513d03d authored by jerrrrry

Initial commit

Pipeline #3321 canceled with stages
import os
import torch
import diffusers
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
from packaging import version
from xfuser.logger import init_logger
logger = init_logger(__name__)
if TYPE_CHECKING:
MASTER_ADDR: str = ""
MASTER_PORT: Optional[int] = None
CUDA_HOME: Optional[str] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
XDIT_LOGGING_LEVEL: str = "INFO"
CUDA_VERSION: version.Version
TORCH_VERSION: version.Version
environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ==================
# used in distributed environment to determine the master address
"MASTER_ADDR": lambda: os.getenv("MASTER_ADDR", ""),
# used in distributed environment to manually set the communication port
"MASTER_PORT": lambda: (
int(os.getenv("MASTER_PORT", "0")) if "MASTER_PORT" in os.environ else None
),
# path to cudatoolkit home directory, under which should be bin, include,
# and lib directories.
"CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
# local rank of the process in the distributed setting, used to determine
# the GPU device id
"LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
# used to control the visible devices in the distributed setting
"CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
# this is used for configuring the default logging level
"XDIT_LOGGING_LEVEL": lambda: os.getenv("XDIT_LOGGING_LEVEL", "INFO"),
}
variables: Dict[str, Callable[[], Any]] = {
# ================== Other Vars ==================
# used in version checking
# "CUDA_VERSION": lambda: version.parse(torch.version.cuda),
"CUDA_VERSION": "gfx936",
"TORCH_VERSION": lambda: version.parse(
version.parse(torch.__version__).base_version
),
}
class PackagesEnvChecker:
_instance = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(PackagesEnvChecker, cls).__new__(cls)
cls._instance.initialize()
return cls._instance
def initialize(self):
self.packages_info = {
"has_flash_attn": self.check_flash_attn(),
"has_long_ctx_attn": self.check_long_ctx_attn(),
"diffusers_version": self.check_diffusers_version(),
}
def check_flash_attn(self):
try:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_name = torch.cuda.get_device_name(device)
if "Turing" in gpu_name or "Tesla" in gpu_name or "T4" in gpu_name:
return False
else:
from flash_attn import flash_attn_func
from flash_attn import __version__
if __version__ < "2.6.0":
raise ImportError(f"install flash_attn >= 2.6.0")
return True
except ImportError:
logger.warning(
f'Flash Attention library "flash_attn" not found, '
f"using pytorch attention implementation"
)
return False
def check_long_ctx_attn(self):
try:
from yunchang import (
set_seq_parallel_pg,
ring_flash_attn_func,
UlyssesAttention,
LongContextAttention,
LongContextAttentionQKVPacked,
)
return True
except ImportError:
logger.warning(
f'Ring Flash Attention library "yunchang" not found, '
f"using pytorch attention implementation"
)
return False
def check_diffusers_version(self):
if version.parse(
version.parse(diffusers.__version__).base_version
) < version.parse("0.30.0"):
raise RuntimeError(
f"Diffusers version: {version.parse(version.parse(diffusers.__version__).base_version)} is not supported,"
f"please upgrade to version > 0.30.0"
)
return version.parse(version.parse(diffusers.__version__).base_version)
def get_packages_info(self):
return self.packages_info
PACKAGES_CHECKER = PackagesEnvChecker()
def __getattr__(name):
# lazy evaluation of environment variables
if name in environment_variables:
return environment_variables[name]()
if name in variables:
return variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def __dir__():
return list(environment_variables.keys()) + list(variables.keys())
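# --- Hypothetical usage sketch, not part of the module above. The module name
# `envs` is an assumption; the module-level __getattr__ defined above evaluates
# each registered entry lazily on attribute access. ---
import envs

print(envs.LOCAL_RANK)          # int(os.environ.get("LOCAL_RANK", "0"))
print(envs.XDIT_LOGGING_LEVEL)  # "INFO" unless overridden in the environment
print(envs.CUDA_VERSION)        # "gfx936", the hard-coded ROCm arch above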
opencv-python==4.9.0.80
diffusers==0.31.0
transformers==4.46.3
tokenizers==0.20.3
accelerate==1.1.1
pandas==2.0.3
numpy==1.24.4
einops==0.7.0
tqdm==4.66.2
loguru==0.7.2
imageio==2.34.0
imageio-ffmpeg==0.5.1
safetensors==0.4.3
gradio==5.0.0
#!/bin/bash
# export GPU_FLUSH_ON_EXECUTION=1
len=129
step=20
mkdir -p video-logs
for num in 2 4; do
torchrun --nproc_per_node=${num} sample_video.py \
--video-size 1280 720 \
--video-length ${len} \
--infer-steps ${step} \
--prompt "A cat walks on the grass, realistic style." \
--flow-reverse \
--seed 42 \
--ulysses-degree ${num} \
--ring-degree 1 \
--save-path ./results 2>&1 | tee video-logs/bw-video-len_${len}-step_${step}-num-${num}.log
done
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# warmup
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=2,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
# Start sampling
# TODO: batch inference check
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from torch.profiler import profile, ProfilerActivity
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# warmup
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=2,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
# Start sampling
# TODO: batch inference check
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=False,with_stack=False,with_modules=False,profile_memory=False) as p:
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale
)
output = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=50)
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
print(output)
p.export_chrome_trace("prof/bak-video-op_rope_mask-len_{}-step_{}-num-2.json".format(args.video_length,args.infer_steps))
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
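# --- Standalone sketch, not part of the script above, of the torch.profiler
# pattern it uses, reduced to a toy matmul workload so it runs without the
# HunyuanVideo models. The output path "prof/toy_trace.json" is illustrative. ---
import os
import torch
from torch.profiler import profile, ProfilerActivity

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1024, 1024, device=device)

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=False, profile_memory=False) as p:
    for _ in range(10):
        x = x @ x  # stand-in for hunyuan_video_sampler.predict(...)

# Same reporting calls as the script above: aggregate table plus a Chrome trace.
sort_key = "self_cuda_time_total" if torch.cuda.is_available() else "self_cpu_time_total"
print(p.key_averages().table(sort_by=sort_key, row_limit=10))
os.makedirs("prof", exist_ok=True)
p.export_chrome_trace("prof/toy_trace.json")  # view in chrome://tracing or Perfetto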
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
python3 sample_video.py \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--use-cpu-offload \
--save-path ./results
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
DIT_CKPT_PATH={PATH_TO}/{MODEL_NAME}_model_states_fp8.pt
python3 sample_video.py \
--dit-weight ${DIT_CKPT_PATH} \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--use-cpu-offload \
--use-fp8 \
--save-path ./results
#!/bin/bash
# Description: This script demonstrates how to run video inference with the HunyuanVideo model
# Supported Parallel Configurations
# | --video-size | --video-length | --ulysses-degree x --ring-degree | --nproc_per_node |
# |----------------------|----------------|----------------------------------|------------------|
# | 1280 720 or 720 1280 | 129 | 8x1,4x2,2x4,1x8 | 8 |
# | 1280 720 or 720 1280 | 129 | 1x5 | 5 |
# | 1280 720 or 720 1280 | 129 | 4x1,2x2,1x4 | 4 |
# | 1280 720 or 720 1280 | 129 | 3x1,1x3 | 3 |
# | 1280 720 or 720 1280 | 129 | 2x1,1x2 | 2 |
# | 1104 832 or 832 1104 | 129 | 4x1,2x2,1x4 | 4 |
# | 1104 832 or 832 1104 | 129 | 3x1,1x3 | 3 |
# | 1104 832 or 832 1104 | 129 | 2x1,1x2 | 2 |
# | 960 960 | 129 | 6x1,3x2,2x3,1x6 | 6 |
# | 960 960 | 129 | 4x1,2x2,1x4 | 4 |
# | 960 960 | 129 | 3x1,1x3 | 3 |
# | 960 960 | 129 | 1x2,2x1 | 2 |
# | 960 544 or 544 960 | 129 | 6x1,3x2,2x3,1x6 | 6 |
# | 960 544 or 544 960 | 129 | 4x1,2x2,1x4 | 4 |
# | 960 544 or 544 960 | 129 | 3x1,1x3 | 3 |
# | 960 544 or 544 960 | 129 | 1x2,2x1 | 2 |
# | 832 624 or 624 832 | 129 | 4x1,2x2,1x4 | 4 |
# | 832 624 or 624 832 | 129 | 3x1,1x3 | 3 |
# | 832 624 or 624 832 | 129 | 2x1,1x2 | 2 |
# | 720 720 | 129 | 1x5 | 5 |
# | 720 720 | 129 | 3x1,1x3 | 3 |
export TOKENIZERS_PARALLELISM=false
export NPROC_PER_NODE=8
export ULYSSES_DEGREE=8
export RING_DEGREE=1
torchrun --nproc_per_node=$NPROC_PER_NODE sample_video.py \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 50 \
--prompt "A cat walks on the grass, realistic style." \
--seed 42 \
--embedded-cfg-scale 6.0 \
--flow-shift 7.0 \
--flow-reverse \
--ulysses-degree=$ULYSSES_DEGREE \
--ring-degree=$RING_DEGREE \
--save-path ./results
import torch
import sys
import os
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)
from hyvideo.modules.attenion import attention
from xfuser.core.long_ctx_attention import xFuserLongContextAttention
from xfuser.core.distributed import (
init_distributed_environment,
initialize_model_parallel,
# initialize_runtime_state,
)
def init_dist(backend="nccl"):
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
print(
f"Initializing distributed environment with rank {rank}, world size {world_size}, local rank {local_rank}"
)
torch.cuda.set_device(local_rank)
init_distributed_environment(rank=rank, world_size=world_size)
# dist.init_process_group(backend=backend)
# construct a hybrid sequence parallel config (ulysses=2, ring = world_size // 2)
if world_size > 1:
ring_degree = world_size // 2
ulysses_degree = 2
else:
ring_degree = 1
ulysses_degree = 1
initialize_model_parallel(
sequence_parallel_degree=world_size,
ring_degree=ring_degree,
ulysses_degree=ulysses_degree,
)
return rank, world_size
def test_mm_double_stream_block_attention(rank, world_size):
device = torch.device(f"cuda:{rank}")
dtype = torch.bfloat16
batch_size = 1
seq_len_img = 118800
seq_len_txt = 256
heads_num = 24
head_dim = 128
img_q = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_k = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_v = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
txt_q = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_k = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_v = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
with torch.no_grad():
torch.distributed.broadcast(img_q, src=0)
torch.distributed.broadcast(img_k, src=0)
torch.distributed.broadcast(img_v, src=0)
torch.distributed.broadcast(txt_q, src=0)
torch.distributed.broadcast(txt_k, src=0)
torch.distributed.broadcast(txt_v, src=0)
q = torch.cat((img_q, txt_q), dim=1)
k = torch.cat((img_k, txt_k), dim=1)
v = torch.cat((img_v, txt_v), dim=1)
cu_seqlens_q = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
cu_seqlens_kv = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
max_seqlen_q = 119056
max_seqlen_kv = 119056
mode = "torch" # "torch", "vanilla", "flash"
original_output = attention(
q,
k,
v,
mode=mode,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
max_seqlen_q=max_seqlen_q,
max_seqlen_kv=max_seqlen_kv,
batch_size=batch_size
)
hybrid_seq_parallel_attn = xFuserLongContextAttention()
hybrid_seq_parallel_output = hybrid_seq_parallel_attn(
None,
img_q,
img_k,
img_v,
dropout_p=0.0,
causal=False,
joint_tensor_query=txt_q,
joint_tensor_key=txt_k,
joint_tensor_value=txt_v,
joint_strategy="rear",
)
b, s, a, d = hybrid_seq_parallel_output.shape
hybrid_seq_parallel_output = hybrid_seq_parallel_output.reshape(b, s, -1)
assert original_output.shape == hybrid_seq_parallel_output.shape, f"Shape mismatch: {original_output.shape} vs {hybrid_seq_parallel_output.shape}"
torch.testing.assert_close(original_output, hybrid_seq_parallel_output, rtol=1e-3, atol=1e-3)
print("test_mm_double_stream_block_attention Passed")
def test_mm_single_stream_block_attention(rank, world_size):
device = torch.device(f"cuda:{rank}")
dtype = torch.bfloat16
txt_len = 256
batch_size = 1
seq_len_img = 118800
seq_len_txt = 256
heads_num = 24
head_dim = 128
with torch.no_grad():
img_q = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
img_k = torch.randn(batch_size, seq_len_img, heads_num, head_dim, device=device, dtype=dtype)
txt_q = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
txt_k = torch.randn(batch_size, seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
v = torch.randn(batch_size, seq_len_img + seq_len_txt, heads_num, head_dim, device=device, dtype=dtype)
torch.distributed.broadcast(img_q, src=0)
torch.distributed.broadcast(img_k, src=0)
torch.distributed.broadcast(txt_q, src=0)
torch.distributed.broadcast(txt_k, src=0)
torch.distributed.broadcast(v, src=0)
q = torch.cat((img_q, txt_q), dim=1)
k = torch.cat((img_k, txt_k), dim=1)
cu_seqlens_q = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
cu_seqlens_kv = torch.tensor([0, 118811, 119056], device=device, dtype=torch.int32)
max_seqlen_q = 119056
max_seqlen_kv = 119056
mode = "torch" # "torch", "vanilla", "flash"
original_output = attention(
q,
k,
v,
mode=mode,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
max_seqlen_q=max_seqlen_q,
max_seqlen_kv=max_seqlen_kv,
batch_size=batch_size
)
hybrid_seq_parallel_attn = xFuserLongContextAttention()
hybrid_seq_parallel_output = hybrid_seq_parallel_attn(
None,
q[:, :-txt_len, :, :],
k[:, :-txt_len, :, :],
v[:, :-txt_len, :, :],
dropout_p=0.0,
causal=False,
joint_tensor_query=q[:, -txt_len:, :, :],
joint_tensor_key=k[:, -txt_len:, :, :],
joint_tensor_value=v[:, -txt_len:, :, :],
joint_strategy="rear",
)
b, s, a, d = hybrid_seq_parallel_output.shape
hybrid_seq_parallel_output = hybrid_seq_parallel_output.reshape(b, s, -1)
assert original_output.shape == hybrid_seq_parallel_output.shape, f"Shape mismatch: {original_output.shape} vs {hybrid_seq_parallel_output.shape}"
torch.testing.assert_close(original_output, hybrid_seq_parallel_output, rtol=1e-3, atol=1e-3)
print("test_mm_single_stream_block_attention Passed")
if __name__ == "__main__":
rank, world_size = init_dist()
test_mm_double_stream_block_attention(rank, world_size)
test_mm_single_stream_block_attention(rank, world_size)
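# --- Hypothetical helper, not part of the test above: it restates the constraint
# that init_dist() relies on, ulysses_degree * ring_degree == world_size.
# Typical launch (file name assumed): torchrun --nproc_per_node=2 <this_test_file>.py ---
def choose_parallel_degrees(world_size: int, ulysses_degree: int = 2):
    """Return (ulysses_degree, ring_degree) whose product equals world_size."""
    if world_size <= 1:
        return 1, 1
    if world_size % ulysses_degree != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by ulysses_degree={ulysses_degree}"
        )
    return ulysses_degree, world_size // ulysses_degree

# Example: 8 ranks with ulysses_degree=2 gives ring_degree=4, matching world_size // 2 above.
assert choose_parallel_degrees(8) == (2, 4)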
# Copyright (c) OpenMMLab. All rights reserved.
"""This file holding some environment constant for sharing by other files."""
import os
import os.path as osp
import subprocess
import sys
from collections import OrderedDict, defaultdict
import numpy as np
import torch
def is_rocm_pytorch() -> bool:
"""Check whether the PyTorch is compiled on ROCm."""
is_rocm = False
if TORCH_VERSION != 'parrots':
try:
from torch.utils.cpp_extension import ROCM_HOME
is_rocm = True if ((torch.version.hip is not None) and
(ROCM_HOME is not None)) else False
except ImportError:
pass
return is_rocm
TORCH_VERSION = torch.__version__
def get_build_config():
"""Obtain the build information of PyTorch or Parrots."""
if TORCH_VERSION == 'parrots':
from parrots.config import get_build_info
return get_build_info()
else:
return torch.__config__.show()
try:
import torch_musa # noqa: F401
IS_MUSA_AVAILABLE = True
except Exception:
IS_MUSA_AVAILABLE = False
def is_musa_available() -> bool:
return IS_MUSA_AVAILABLE
def is_cuda_available() -> bool:
"""Returns True if cuda devices exist."""
return torch.cuda.is_available()
def _get_cuda_home():
if TORCH_VERSION == 'parrots':
from parrots.utils.build_extension import CUDA_HOME
else:
if is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
CUDA_HOME = ROCM_HOME
else:
from torch.utils.cpp_extension import CUDA_HOME
return CUDA_HOME
def _get_musa_home():
return os.environ.get('MUSA_HOME')
def collect_env():
"""Collect the information of the running environments.
Returns:
dict: The environment information. The following fields are contained.
- sys.platform: The variable of ``sys.platform``.
- Python: Python version.
- CUDA available: Bool, indicating if CUDA is available.
- GPU devices: Device type of each GPU.
- CUDA_HOME (optional): The env var ``CUDA_HOME``.
- NVCC (optional): NVCC version.
- GCC: GCC version, "n/a" if GCC is not installed.
- MSVC: Microsoft Virtual C++ Compiler version, Windows only.
- PyTorch: PyTorch version.
- PyTorch compiling details: The output of \
``torch.__config__.show()``.
- TorchVision (optional): TorchVision version.
- OpenCV (optional): OpenCV version.
"""
from distutils import errors
env_info = OrderedDict()
env_info['sys.platform'] = sys.platform
env_info['Python'] = sys.version.replace('\n', '')
cuda_available = is_cuda_available()
musa_available = is_musa_available()
env_info['CUDA available'] = cuda_available
env_info['MUSA available'] = musa_available
env_info['numpy_random_seed'] = np.random.get_state()[1][0]
if cuda_available:
devices = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
for name, device_ids in devices.items():
env_info['GPU ' + ','.join(device_ids)] = name
CUDA_HOME = _get_cuda_home()
env_info['CUDA_HOME'] = CUDA_HOME
if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
if CUDA_HOME == '/opt/rocm':
try:
nvcc = osp.join(CUDA_HOME, 'hip/bin/hipcc')
nvcc = subprocess.check_output(
f'"{nvcc}" --version', shell=True)
nvcc = nvcc.decode('utf-8').strip()
release = nvcc.rfind('HIP version:')
build = nvcc.rfind('')
nvcc = nvcc[release:build].strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
else:
try:
nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
nvcc = subprocess.check_output(f'"{nvcc}" -V', shell=True)
nvcc = nvcc.decode('utf-8').strip()
release = nvcc.rfind('Cuda compilation tools')
build = nvcc.rfind('Build ')
nvcc = nvcc[release:build].strip()
except subprocess.SubprocessError:
nvcc = 'Not Available'
env_info['NVCC'] = nvcc
elif musa_available:
devices = defaultdict(list)
for k in range(torch.musa.device_count()):
devices[torch.musa.get_device_name(k)].append(str(k))
for name, device_ids in devices.items():
env_info['GPU ' + ','.join(device_ids)] = name
MUSA_HOME = _get_musa_home()
env_info['MUSA_HOME'] = MUSA_HOME
if MUSA_HOME is not None and osp.isdir(MUSA_HOME):
try:
mcc = osp.join(MUSA_HOME, 'bin/mcc')
subprocess.check_output(f'"{mcc}" -v', shell=True)
except subprocess.SubprocessError:
mcc = 'Not Available'
env_info['mcc'] = mcc
try:
# Check C++ Compiler.
# For Unix-like, sysconfig has 'CC' variable like 'gcc -pthread ...',
# indicating the compiler used, we use this to get the compiler name
import io
import sysconfig
cc = sysconfig.get_config_var('CC')
if cc:
cc = osp.basename(cc.split()[0])
cc_info = subprocess.check_output(f'{cc} --version', shell=True)
env_info['GCC'] = cc_info.decode('utf-8').partition(
'\n')[0].strip()
else:
# on Windows, cl.exe is not in PATH. We need to find the path.
# distutils.ccompiler.new_compiler() returns a msvccompiler
# object and after initialization, path to cl.exe is found.
import locale
import os
from distutils.ccompiler import new_compiler
ccompiler = new_compiler()
ccompiler.initialize()
cc = subprocess.check_output(
f'{ccompiler.cc}', stderr=subprocess.STDOUT, shell=True)
encoding = os.device_encoding(
sys.stdout.fileno()) or locale.getpreferredencoding()
env_info['MSVC'] = cc.decode(encoding).partition('\n')[0].strip()
env_info['GCC'] = 'n/a'
except (subprocess.CalledProcessError, errors.DistutilsPlatformError):
env_info['GCC'] = 'n/a'
except io.UnsupportedOperation as e:
# JupyterLab on Windows changes sys.stdout, which has no `fileno` attr
# Refer to: https://github.com/open-mmlab/mmengine/issues/931
# TODO: find a solution to get compiler info in Windows JupyterLab,
# while preserving backward-compatibility in other systems.
env_info['MSVC'] = f'n/a, reason: {str(e)}'
env_info['PyTorch'] = torch.__version__
env_info['PyTorch compiling details'] = get_build_config()
try:
import torchvision
env_info['TorchVision'] = torchvision.__version__
except ModuleNotFoundError:
pass
try:
import cv2
env_info['OpenCV'] = cv2.__version__
except ImportError:
pass
return env_info
if __name__ == '__main__':
for name, val in collect_env().items():
print(f'{name}: {val}')
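# --- Minimal usage sketch, assuming the file above is saved as collect_env.py and
# is importable from the working directory; the field names follow the collect_env() docstring. ---
from collect_env import collect_env

info = collect_env()
for key in ("sys.platform", "Python", "CUDA available", "CUDA_HOME", "NVCC", "PyTorch"):
    print(f"{key}: {info.get(key, 'n/a')}")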
Namespace(model='HYVideo-T/2-cfgdistill', latent_channels=16, precision='bf16', rope_theta=256, vae='884-16c-hy', vae_precision='fp16', vae_tiling=True, text_encoder='llm', text_encoder_precision='fp16', text_states_dim=4096, text_len=256, tokenizer='llm', prompt_template='dit-llm-encode', prompt_template_video='dit-llm-encode-video', hidden_state_skip_layer=2, apply_final_norm=False, text_encoder_2='clipL', text_encoder_precision_2='fp16', text_states_dim_2=768, tokenizer_2='clipL', text_len_2=77, denoise_type='flow', flow_shift=7.0, flow_reverse=True, flow_solver='euler', use_linear_quadratic_schedule=False, linear_schedule_end=25, model_base='ckpts', dit_weight='ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt', model_resolution='540p', load_key='module', use_cpu_offload=False, batch_size=1, infer_steps=20, disable_autocast=False, save_path='./results', save_path_suffix='', name_suffix='', num_videos=1, video_size=[1280, 720], video_length=33, prompt='A cat walks on the grass, realistic style.', seed_type='auto', seed=42, neg_prompt=None, cfg_scale=1.0, embedded_cfg_scale=6.0, use_fp8=False, reproduce=False, ulysses_degree=1, ring_degree=1)
2026-02-02 14:09:46.064 | INFO | hyvideo.inference:from_pretrained:154 - Got text-to-video model root path: ckpts
2026-02-02 14:09:46.065 | INFO | hyvideo.inference:from_pretrained:189 - Building model...
2026-02-02 14:09:46.741 | INFO | hyvideo.inference:load_state_dict:340 - Loading torch model ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt...
/workspace/cicd/HunyuanVideo-t2v/hyvideo/inference.py:341: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
2026-02-02 14:10:02.963 | INFO | hyvideo.vae:load_vae:29 - Loading 3D VAE model (884-16c-hy) from: ./ckpts/hunyuan-video-t2v-720p/vae
/workspace/cicd/HunyuanVideo-t2v/hyvideo/vae/__init__.py:39: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
ckpt = torch.load(vae_ckpt, map_location=vae.device)
2026-02-02 14:10:05.461 | INFO | hyvideo.vae:load_vae:55 - VAE to dtype: torch.float16
2026-02-02 14:10:05.633 | INFO | hyvideo.text_encoder:load_text_encoder:28 - Loading text encoder model (llm) from: ./ckpts/text_encoder
Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] Loading checkpoint shards: 25%|██▌ | 1/4 [00:02<00:07, 2.49s/it] Loading checkpoint shards: 50%|█████ | 2/4 [00:05<00:05, 2.71s/it] Loading checkpoint shards: 75%|███████▌ | 3/4 [00:08<00:02, 2.77s/it] Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 1.79s/it] Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.12s/it]
2026-02-02 14:10:19.819 | INFO | hyvideo.text_encoder:load_text_encoder:50 - Text encoder to dtype: torch.float16
2026-02-02 14:10:23.769 | INFO | hyvideo.text_encoder:load_tokenizer:64 - Loading tokenizer (llm) from: ./ckpts/text_encoder
2026-02-02 14:10:24.283 | INFO | hyvideo.text_encoder:load_text_encoder:28 - Loading text encoder model (clipL) from: ./ckpts/text_encoder_2
2026-02-02 14:10:24.447 | INFO | hyvideo.text_encoder:load_text_encoder:50 - Text encoder to dtype: torch.float16
2026-02-02 14:10:24.500 | INFO | hyvideo.text_encoder:load_tokenizer:64 - Loading tokenizer (clipL) from: ./ckpts/text_encoder_2
2026-02-02 14:10:24.595 | INFO | hyvideo.inference:predict:580 - Input (height, width, video_length) = (1280, 720, 33)
2026-02-02 14:10:24.617 | DEBUG | hyvideo.inference:predict:642 -
height: 1280
width: 720
video_length: 33
prompt: ['A cat walks on the grass, realistic style.']
neg_prompt: ['']
seed: 42
infer_steps: 2
num_videos_per_prompt: 1
guidance_scale: 1.0
n_tokens: 32400
flow_shift: 7.0
embedded_guidance_scale: 6.0
/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py:602: UserWarning: 1Torch was not compiled with memory efficient attention. (Triggered internally at /home/pytorch/aten/src/ATen/native/transformers/hip/sdp_utils.cpp:663.)
attn_output = torch.nn.functional.scaled_dot_product_attention(
0%| | 0/2 [00:00<?, ?it/s] 50%|█████ | 1/2 [00:12<00:12, 12.57s/it] 100%|██████████| 2/2 [00:20<00:00, 9.91s/it] 100%|██████████| 2/2 [00:20<00:00, 10.30s/it]
2026-02-02 14:11:05.154 | INFO | hyvideo.inference:predict:671 - Success, time: 40.5368127822876
2026-02-02 14:11:05.154 | INFO | hyvideo.inference:predict:580 - Input (height, width, video_length) = (1280, 720, 33)
2026-02-02 14:11:05.180 | DEBUG | hyvideo.inference:predict:642 -
height: 1280
width: 720
video_length: 33
prompt: ['A cat walks on the grass, realistic style.']
neg_prompt: ['']
seed: 42
infer_steps: 20
num_videos_per_prompt: 1
guidance_scale: 1.0
n_tokens: 32400
flow_shift: 7.0
embedded_guidance_scale: 6.0
0%| | 0/20 [00:00<?, ?it/s] 5%|▌ | 1/20 [00:08<02:35, 8.17s/it] 10%|█ | 2/20 [00:16<02:25, 8.10s/it] 15%|█▌ | 3/20 [00:24<02:18, 8.13s/it] 20%|██ | 4/20 [00:32<02:10, 8.14s/it] 25%|██▌ | 5/20 [00:40<02:02, 8.14s/it] 30%|███ | 6/20 [00:48<01:54, 8.15s/it] 35%|███▌ | 7/20 [00:57<01:45, 8.15s/it] 40%|████ | 8/20 [01:05<01:37, 8.16s/it] 45%|████▌ | 9/20 [01:13<01:29, 8.16s/it] 50%|█████ | 10/20 [01:21<01:21, 8.16s/it] 55%|█████▌ | 11/20 [01:29<01:13, 8.16s/it] 60%|██████ | 12/20 [01:37<01:05, 8.16s/it] 65%|██████▌ | 13/20 [01:46<00:57, 8.17s/it] 70%|███████ | 14/20 [01:54<00:49, 8.17s/it] 75%|███████▌ | 15/20 [02:02<00:40, 8.17s/it] 80%|████████ | 16/20 [02:10<00:32, 8.17s/it] 85%|████████▌ | 17/20 [02:18<00:24, 8.16s/it] 90%|█████████ | 18/20 [02:26<00:16, 8.16s/it] 95%|█████████▌| 19/20 [02:35<00:08, 8.17s/it] 100%|██████████| 20/20 [02:43<00:00, 8.16s/it] 100%|██████████| 20/20 [02:43<00:00, 8.16s/it]
2026-02-02 14:14:02.787 | INFO | hyvideo.inference:predict:671 - Success, time: 177.60623216629028
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
- Avoid using `tokenizers` before the fork if possible
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2026-02-02 14:14:03.989 | INFO | __main__:main:72 - Sample save to: ./results/2026-02-02-14:14:02_seed42_A cat walks on the grass, realistic style..mp4