Unverified commit c0863477 authored by Kane, committed by GitHub

Mlu590 (#520)



1. Fixed the earlier code merge conflicts; tests pass.
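The changes below share one pattern: weight loaders accept any accelerator device via `device.type in ["cuda", "mlu", "npu"]`, hard-coded `"cuda"` literals become a configurable `self.device`, and backend-specific maintenance calls (`synchronize()`, `empty_cache()`) are dispatched through the matching `torch.<backend>` module. A minimal sketch of that dispatch, assuming the Cambricon (`torch.mlu`) and Ascend (`torch.npu`) plugins are installed when those backends are used; the helper names below are illustrative, not functions defined in this repository:

```python
import torch


def detect_run_device() -> str:
    """Illustrative: pick the backend name that run_device keys off of."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "mlu") and torch.mlu.is_available():  # Cambricon plugin (assumed installed)
        return "mlu"
    if hasattr(torch, "npu") and torch.npu.is_available():  # Ascend plugin (assumed installed)
        return "npu"
    return "cpu"


def empty_device_cache(run_device: str) -> None:
    """Illustrative: release cached allocator blocks on whichever backend is active."""
    backend = getattr(torch, run_device, None)
    if backend is not None and hasattr(backend, "empty_cache"):
        backend.empty_cache()  # torch.cuda / torch.mlu / torch.npu all expose empty_cache()
```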

---------
Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
parent 47b3ce2f
@@ -117,6 +117,9 @@ class UlyssesAttnWeight(AttnWeightTemplate):
         elif hasattr(torch, "mlu") and torch.mlu.is_available():
             torch.mlu.synchronize()
             self.config["run_device"] = "mlu"
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            torch.npu.synchronize()
+            self.config["run_device"] = "npu"
 @ATTN_WEIGHT_REGISTER("ulysses-4090")
......
@@ -35,7 +35,7 @@ class Conv3dWeight(Conv3dWeightTemplate):
     def load(self, weight_dict):
         device = weight_dict[self.weight_name].device
-        if device.type == "cuda":
+        if device.type in ["cuda", "mlu", "npu"]:
             self.weight = weight_dict[self.weight_name]
             if self.bias_name is not None:
                 self.bias = weight_dict[self.bias_name]
......
@@ -22,7 +22,7 @@ class EmbeddingWeightTemplate(metaclass=ABCMeta):
             self.weight_cuda_buffer = weight_dict[self.weight_name].cuda()
         else:
             device = weight_dict[self.weight_name].device
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.weight = weight_dict[self.weight_name]
             elif device.type == "cpu":
                 weight_shape = weight_dict[self.weight_name].shape
......
@@ -296,7 +296,7 @@ class MMWeightQuantTemplate(MMWeightTemplate):
             self.bias_cuda_buffer = weight_dict[self.bias_name].cuda()
         else:
             device = weight_dict[self.bias_name].device
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.bias = weight_dict[self.bias_name]
             elif device.type == "cpu":
                 bias_shape = weight_dict[self.bias_name].shape
@@ -362,7 +362,7 @@ class MMWeightQuantTemplate(MMWeightTemplate):
             self.weight, self.weight_scale = self.weight.to(device), self.weight_scale.to(device)
         else:
             device = weight_dict[self.weight_name].device
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.weight = weight_dict[self.weight_name]
                 self.weight_scale = weight_dict[self.weight_scale_name]
             elif device.type == "cpu":
@@ -387,7 +387,7 @@ class MMWeightQuantTemplate(MMWeightTemplate):
             self.weight, self.weight_scale = self.weight.to(device), self.weight_scale.to(device)
         else:
             device = weight_dict[self.weight_name].device
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.weight = weight_dict[self.weight_name]
                 self.weight_scale = weight_dict[self.weight_scale_name]
             elif device.type == "cpu":
@@ -412,7 +412,7 @@ class MMWeightQuantTemplate(MMWeightTemplate):
             weight_global_scale = weight_dict[f"{self.weight_name}_global_scale"]
             alpha = 1.0 / (input_global_scale * weight_global_scale)
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.weight = weight_dict[self.weight_name]
                 self.weight_scale = weight_dict[self.weight_scale_name]
                 self.input_global_scale = input_global_scale
@@ -1172,8 +1172,8 @@ class MMWeightWint8channelAint8channeldynamicMlu(MMWeightQuantTemplate):
     Kernel: mlu
     """
-    def __init__(self, weight_name, bias_name, lazy_load=False, lazy_load_file=None):
-        super().__init__(weight_name, bias_name, lazy_load, lazy_load_file)
+    def __init__(self, weight_name, bias_name, create_cuda_buffer=False, lazy_load=False, lazy_load_file=None, is_post_adapter=False):
+        super().__init__(weight_name, bias_name, create_cuda_buffer, lazy_load, lazy_load_file, is_post_adapter)
         self.load_func = self.load_int8_perchannel_sym
         self.weight_need_transpose = False
         self.act_quant_func = self.act_quant_int8_perchannel_sym_tmo
......
@@ -32,7 +32,7 @@ class LNWeightTemplate(metaclass=ABCMeta):
         else:
             if self.weight_name is not None:
                 device = weight_dict[self.weight_name].device
-                if device.type == "cuda":
+                if device.type in ["cuda", "mlu", "npu"]:
                     self.weight = weight_dict[self.weight_name]
                     if self.bias_name is not None:
                         self.bias = weight_dict[self.bias_name]
......
@@ -337,6 +337,8 @@ def maybe_contiguous(x):
 def triton_autotune_configs():
+    if not torch.cuda.is_available():
+        return []
     # Return configs with a valid warp count for the current device
     configs = []
     # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
......
@@ -29,7 +29,7 @@ class DefaultTensor:
             self.tensor_cuda_buffer = weight_dict[self.tensor_name].cuda()
         else:
             device = weight_dict[self.tensor_name].device
-            if device.type == "cuda":
+            if device.type in ["cuda", "mlu", "npu"]:
                 self.tensor = weight_dict[self.tensor_name]
             elif device.type == "cpu":
                 tensor_shape = weight_dict[self.tensor_name].shape
......
@@ -158,7 +158,7 @@ class ByT5TextEncoder:
     def __init__(
         self,
         config,
-        device=torch.cuda.current_device(),
+        device=torch.device("cpu"),
         checkpoint_path=None,
         byt5_max_length=256,
         cpu_offload=False,
@@ -277,8 +277,8 @@ class ByT5TextEncoder:
         formatted_text = self.prompt_format.format_prompt(glyph_texts, text_styles)
         text_ids, text_mask = self.get_byt5_text_tokens(self.byt5_tokenizer, self.byt5_max_length, formatted_text)
-        text_ids = text_ids.to("cuda")
-        text_mask = text_mask.to("cuda")
+        text_ids = text_ids.to(device)
+        text_mask = text_mask.to(device)
         byt5_outputs = self.byt5_model(text_ids, attention_mask=text_mask.float())
         byt5_embeddings = byt5_outputs[0]
@@ -300,12 +300,12 @@ class ByT5TextEncoder:
         negative_masks = []
         for prompt in prompt_list:
-            pos_emb, pos_mask = self._process_single_byt5_prompt(prompt, "cuda")
+            pos_emb, pos_mask = self._process_single_byt5_prompt(prompt, self.device)
             positive_embeddings.append(pos_emb)
             positive_masks.append(pos_mask)
             if self.enable_cfg:  # TODO: split cfg out; better suited to parallelism
-                neg_emb, neg_mask = self._process_single_byt5_prompt("", "cuda")
+                neg_emb, neg_mask = self._process_single_byt5_prompt("", self.device)
                 negative_embeddings.append(neg_emb)
                 negative_masks.append(neg_mask)
@@ -327,8 +327,8 @@ class ByT5TextEncoder:
     @torch.no_grad()
     def infer(self, prompts):
         if self.cpu_offload:
-            self.byt5_model = self.byt5_model.to("cuda")
-            self.byt5_mapper = self.byt5_mapper.to("cuda")
+            self.byt5_model = self.byt5_model.to(self.device)
+            self.byt5_mapper = self.byt5_mapper.to(self.device)
         byt5_embeddings, byt5_masks = self._prepare_byt5_embeddings(prompts)
         byt5_features = self.byt5_mapper(byt5_embeddings.to(torch.bfloat16))
         if self.cpu_offload:
......
@@ -144,7 +144,13 @@ def load_text_encoder(
            continue
        new_w_dict[key.replace("model.", "")] = weight_dict[key]
    del weight_dict
-    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif "mlu" in str(device):
+        torch.mlu.empty_cache()
+    elif "npu" in str(device):
+        torch.npu.empty_cache()
    gc.collect()
    text_encoder.load_state_dict(new_w_dict, assign=True)
@@ -545,7 +551,7 @@ class Qwen25VL_TextEncoder:
        self,
        text_len=1000,
        dtype=torch.float16,
-        device=torch.cuda.current_device(),
+        device=torch.device("cpu"),
        checkpoint_path=None,
        cpu_offload=False,
        qwen25vl_quantized=False,
@@ -583,20 +589,20 @@ class Qwen25VL_TextEncoder:
    def infer(self, texts):
        if self.cpu_offload:
-            self.text_encoder = self.text_encoder.to("cuda")
+            self.text_encoder = self.text_encoder.to(self.device)
        text_inputs = self.text_encoder.text2tokens(texts, data_type="video", max_length=self.text_len)
-        prompt_outputs = self.text_encoder.encode(text_inputs, data_type="video", device="cuda")
+        prompt_outputs = self.text_encoder.encode(text_inputs, data_type="video", device=self.device)
        if self.cpu_offload:
            self.text_encoder = self.text_encoder.to("cpu")
        prompt_embeds = prompt_outputs.hidden_state
        attention_mask = prompt_outputs.attention_mask
        if attention_mask is not None:
-            attention_mask = attention_mask.cuda()
+            attention_mask = attention_mask.to(self.device)
            _, seq_len = attention_mask.shape
            attention_mask = attention_mask.repeat(1, self.num_videos_per_prompt)
            attention_mask = attention_mask.view(self.num_videos_per_prompt, seq_len)
-        prompt_embeds = prompt_embeds.to(dtype=self.dtype, device="cuda")
+        prompt_embeds = prompt_embeds.to(dtype=self.dtype, device=self.device)
        seq_len = prompt_embeds.shape[1]
        # duplicate text embeddings for each generation per prompt, using mps friendly method
......
@@ -175,7 +175,7 @@ class VisionEncoder(nn.Module):
        if isinstance(images, np.ndarray):
            # Preprocess images if they're numpy arrays
-            preprocessed = self.processor.preprocess(images=images, return_tensors="pt").to(device="cuda", dtype=self.model.dtype)
+            preprocessed = self.processor.preprocess(images=images, return_tensors="pt").to(device=self.device, dtype=self.model.dtype)
        else:
            # Assume already preprocessed
            preprocessed = images
@@ -229,7 +229,7 @@ class SiglipVisionEncoder:
    def __init__(
        self,
        config,
-        device=torch.cuda.current_device(),
+        device=torch.device("cpu"),
        checkpoint_path=None,
        cpu_offload=False,
    ):
......
@@ -62,7 +62,7 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
        if self.cpu_offload:
            self.device = torch.device("cpu")
        else:
-            self.device = torch.device(self.run_device)
+            self.device = torch.device(self.config.get("run_device", "cuda"))
        self.dtype = torch.bfloat16
        self.load()
@@ -95,7 +95,7 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
    @torch.no_grad()
    def infer(self, text, image_list=None):
        if self.cpu_offload:
-            self.text_encoder.to(self.run_device)
+            self.text_encoder.to(self.device)
        if image_list is not None:
            condition_image_list = []
@@ -130,7 +130,7 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
                images=condition_image_list,
                padding=True,
                return_tensors="pt",
-            ).to(torch.device(self.run_device))
+            ).to(torch.device(self.device))
            encoder_hidden_states = self.text_encoder(
                input_ids=model_inputs.input_ids,
@@ -153,7 +153,7 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
            txt = [template.format(e) for e in text]
            image_info = {}
-            model_inputs = self.tokenizer(txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to(torch.device(self.run_device))
+            model_inputs = self.tokenizer(txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt").to(self.device)
            encoder_hidden_states = self.text_encoder(
                input_ids=model_inputs.input_ids,
                attention_mask=model_inputs.attention_mask,
@@ -169,7 +169,7 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
        prompt_embeds = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states])
        encoder_attention_mask = torch.stack([torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list])
-        prompt_embeds = prompt_embeds.to(dtype=self.dtype, device=self.run_device)
+        prompt_embeds = prompt_embeds.to(dtype=self.dtype, device=self.device)
        prompt_embeds_mask = encoder_attention_mask
        _, seq_len, _ = prompt_embeds.shape
@@ -180,12 +180,9 @@ class Qwen25_VLForConditionalGeneration_TextEncoder:
        if self.cpu_offload:
            self.text_encoder.to(torch.device("cpu"))
-            if "mlu" in str(self.device):
-                torch.mlu.empty_cache()
-            elif "cuda" in str(self.device):
-                torch.cuda.empty_cache()
-            elif "npu" in str(self.device):
-                torch.npu.empty_cache()
+            if hasattr(torch, self.config.get("run_device", "cuda")):
+                torch_module = getattr(torch, self.config.get("run_device", "cuda"))
+                torch_module.empty_cache()
            gc.collect()
        return prompt_embeds, prompt_embeds_mask, image_info
@@ -252,7 +252,7 @@ class AudioAdapter(nn.Module):
        quantized: bool = False,
        quant_scheme: str = None,
        cpu_offload: bool = False,
-        run_device=torch.device("cuda"),
+        device=torch.device("cpu"),
    ):
        super().__init__()
        self.cpu_offload = cpu_offload
@@ -263,7 +263,7 @@ class AudioAdapter(nn.Module):
            mlp_dims=mlp_dims,
            transformer_layers=projection_transformer_layers,
        )
-        self.run_device = run_device
+        self.device = torch.device(device)
        # self.num_tokens = num_tokens * 4
        self.num_tokens_x4 = num_tokens * 4
        self.audio_pe = nn.Parameter(torch.randn(self.num_tokens_x4, mlp_dims[-1] // num_tokens) * 0.02)
@@ -302,10 +302,10 @@ class AudioAdapter(nn.Module):
    @torch.no_grad()
    def forward_audio_proj(self, audio_feat, latent_frame):
        if self.cpu_offload:
-            self.audio_proj.to(self.run_device)
+            self.audio_proj.to(self.device)
        x = self.audio_proj(audio_feat, latent_frame)
        x = self.rearange_audio_features(x)
-        x = x + self.audio_pe.to(self.run_device)
+        x = x + self.audio_pe.to(self.device)
        if self.cpu_offload:
            self.audio_proj.to("cpu")
        return x
@@ -5,15 +5,14 @@ from lightx2v.utils.envs import *
 class SekoAudioEncoderModel:
-    def __init__(self, model_path, audio_sr, cpu_offload, run_device):
+    def __init__(self, model_path, audio_sr, cpu_offload, device):
        self.model_path = model_path
        self.audio_sr = audio_sr
        self.cpu_offload = cpu_offload
        if self.cpu_offload:
            self.device = torch.device("cpu")
        else:
-            self.device = torch.device(run_device)
-        self.run_device = run_device
+            self.device = torch.device(device)
        self.load()
    def load(self):
@@ -27,13 +26,13 @@ class SekoAudioEncoderModel:
            self.audio_feature_encoder = self.audio_feature_encoder.to("cpu")
    def to_cuda(self):
-        self.audio_feature_encoder = self.audio_feature_encoder.to(self.run_device)
+        self.audio_feature_encoder = self.audio_feature_encoder.to(self.device)
    @torch.no_grad()
    def infer(self, audio_segment):
-        audio_feat = self.audio_feature_extractor(audio_segment, sampling_rate=self.audio_sr, return_tensors="pt").input_values.to(self.run_device).to(dtype=GET_DTYPE())
+        audio_feat = self.audio_feature_extractor(audio_segment, sampling_rate=self.audio_sr, return_tensors="pt").input_values.to(self.device).to(dtype=GET_DTYPE())
        if self.cpu_offload:
-            self.audio_feature_encoder = self.audio_feature_encoder.to(self.run_device)
+            self.audio_feature_encoder = self.audio_feature_encoder.to(self.device)
        audio_feat = self.audio_feature_encoder(audio_feat, return_dict=True).last_hidden_state
        if self.cpu_offload:
            self.audio_feature_encoder = self.audio_feature_encoder.to("cpu")
......
@@ -744,8 +744,7 @@ class T5EncoderModel:
        self,
        text_len,
        dtype=torch.bfloat16,
-        device=torch.device("cuda"),
-        run_device=torch.device("cuda"),
+        device=torch.device("cpu"),
        checkpoint_path=None,
        tokenizer_path=None,
        shard_fn=None,
@@ -758,7 +757,6 @@ class T5EncoderModel:
        self.text_len = text_len
        self.dtype = dtype
        self.device = device
-        self.run_device = run_device
        if t5_quantized_ckpt is not None and t5_quantized:
            self.checkpoint_path = t5_quantized_ckpt
        else:
@@ -807,8 +805,8 @@ class T5EncoderModel:
    def infer(self, texts):
        ids, mask = self.tokenizer(texts, return_mask=True, add_special_tokens=True)
-        ids = ids.to(self.run_device)
-        mask = mask.to(self.run_device)
+        ids = ids.to(self.device)
+        mask = mask.to(self.device)
        seq_lens = mask.gt(0).sum(dim=1).long()
        with torch.no_grad():
......
@@ -292,7 +292,7 @@ class VisionTransformer(nn.Module):
        b = x.size(0)
        # embeddings
-        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
+        x = self.patch_embedding(x.type(self.patch_embedding.weight.type())).flatten(2).permute(0, 2, 1)
        if self.pool_type in ("token", "token_fc"):
            x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
        if interpolation:
......
@@ -10,7 +10,7 @@ except ImportError:
    flash_attn_varlen_func_v3 = None
    logger.info("flash_attn_varlen_func_v3 not available")
-if torch.cuda.get_device_capability(0) in [(8, 9), (12, 0)]:
+if torch.cuda.is_available() and torch.cuda.get_device_capability(0) in [(8, 9), (12, 0)]:
    try:
        from sageattention import sageattn_qk_int8_pv_fp16_triton as sageattn
    except ImportError:
......
@@ -68,6 +68,7 @@ class HunyuanVideo15PreInfer:
        self.heads_num = config["heads_num"]
        self.frequency_embedding_size = 256
        self.max_period = 10000
+        self.device = torch.device(self.config.get("run_device", "cuda"))
    def set_scheduler(self, scheduler):
        self.scheduler = scheduler
@@ -154,7 +155,7 @@ class HunyuanVideo15PreInfer:
            byt5_txt = byt5_txt + weights.cond_type_embedding.apply(torch.ones_like(byt5_txt[:, :, 0], device=byt5_txt.device, dtype=torch.long))
            txt, text_mask = self.reorder_txt_token(byt5_txt, txt, byt5_text_mask, text_mask, zero_feat=True)
-            siglip_output = siglip_output + weights.cond_type_embedding.apply(2 * torch.ones_like(siglip_output[:, :, 0], dtype=torch.long, device=torch.device("cuda")))
+            siglip_output = siglip_output + weights.cond_type_embedding.apply(2 * torch.ones_like(siglip_output[:, :, 0], dtype=torch.long, device=self.device))
            txt, text_mask = self.reorder_txt_token(siglip_output, txt, siglip_mask, text_mask)
            txt = txt[:, : text_mask.sum(), :]
......
@@ -3,7 +3,11 @@ from typing import Tuple
 import torch
 import torch.nn.functional as F
 from einops import rearrange
-from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
+try:
+    from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
+except Exception as e:
+    apply_rope_with_cos_sin_cache_inplace = None
 from lightx2v.common.transformer_infer.transformer_infer import BaseTransformerInfer
@@ -96,6 +100,7 @@ class HunyuanVideo15TransformerInfer(BaseTransformerInfer):
        self.config = config
        self.double_blocks_num = config["mm_double_blocks_depth"]
        self.heads_num = config["heads_num"]
+        self.device = torch.device(self.config.get("run_device", "cuda"))
        if self.config["seq_parallel"]:
            self.seq_p_group = self.config.get("device_mesh").get_group(mesh_dim="seq_p")
        else:
@@ -215,7 +220,7 @@ class HunyuanVideo15TransformerInfer(BaseTransformerInfer):
        key = torch.cat([img_k, txt_k], dim=1)
        value = torch.cat([img_v, txt_v], dim=1)
        seqlen = query.shape[1]
-        cu_seqlens_qkv = torch.tensor([0, seqlen], dtype=torch.int32, device="cpu").to("cuda", non_blocking=True)
+        cu_seqlens_qkv = torch.tensor([0, seqlen], dtype=torch.int32, device="cpu").to(self.device, non_blocking=True)
        if self.config["seq_parallel"]:
            attn_out = weights.self_attention_parallel.apply(
......
@@ -339,6 +339,8 @@ def maybe_contiguous(x):
 def triton_autotune_configs():
+    if not torch.cuda.is_available():
+        return []
     # Return configs with a valid warp count for the current device
     configs = []
     # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
......
@@ -176,8 +176,8 @@ class HunyuanVideo15Model(CompiledMethodsMixin):
    def _load_safetensor_to_dict(self, file_path, unified_dtype, sensitive_layer):
        remove_keys = self.remove_keys if hasattr(self, "remove_keys") else []
-        if self.device.type == "cuda" and dist.is_initialized():
-            device = torch.device("cuda:{}".format(dist.get_rank()))
+        if self.device.type != "cpu" and dist.is_initialized():
+            device = torch.device("{}:{}".format(self.device.type, dist.get_rank()))
        else:
            device = self.device
......