"tests/vscode:/vscode.git/clone" did not exist on "4f65af0e252066d961bf864d0862f442e497f619"
Commit daf4c74e authored by helloyongyang's avatar helloyongyang Committed by Yang Yong(雍洋)
Browse files

first commit

parent 6c79160f
This diff is collapsed.
import torch
from transformers import CLIPTextModel, AutoTokenizer
class TextEncoderHFClipModel():
    """Hugging Face CLIP text encoder wrapper.

    Loads the CLIP text model in fp16 on ``device`` and exposes :meth:`infer`,
    which returns the pooled text embedding plus the token attention mask.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of an ``infer`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, device):
        self.device = device
        self.model_path = model_path
        self.init()
        self.load()

    def init(self):
        # 77 is CLIP's standard maximum token sequence length.
        self.max_length = 77

    def load(self):
        self.model = CLIPTextModel.from_pretrained(self.model_path).to(torch.float16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side="right")

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()
    def infer(self, text, args):
        """Encode ``text``.

        Returns a ``(pooled_embedding, attention_mask)`` tuple.
        ``args.cpu_offload`` controls temporary CUDA placement of the model.
        """
        if args.cpu_offload:
            self.to_cuda()
        tokens = self.tokenizer(
            text,
            return_length=False,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(self.model.device)  # fix: follow the model's device instead of hard-coding "cuda"
        outputs = self.model(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            output_hidden_states=False,
        )
        # NOTE: despite the variable name, this is CLIP's pooled output
        # ("pooler_output"), not the token-level last hidden state.
        last_hidden_state = outputs["pooler_output"]
        if args.cpu_offload:
            self.to_cpu()
        return last_hidden_state, tokens["attention_mask"]
if __name__ == "__main__":
    from types import SimpleNamespace

    model = TextEncoderHFClipModel("/mnt/nvme0/yongyang/projects/hy/HunyuanVideo/ckpts/text_encoder_2", torch.device("cuda"))
    text = 'A cat walks on the grass, realistic style.'
    # fix: infer() requires an `args` object with a `cpu_offload` flag; the
    # original called infer(text) with no args, raising a TypeError.
    outputs = model.infer(text, SimpleNamespace(cpu_offload=False))
    print(outputs)
import torch
from transformers import AutoModel, AutoTokenizer
class TextEncoderHFLlamaModel():
    """Hugging Face LLaMA text encoder wrapper.

    Loads the model in fp16 on ``device`` and exposes :meth:`infer`, which
    wraps the prompt in a fixed chat template, encodes it, and returns hidden
    states from an intermediate layer with the template prefix cropped off.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of an ``infer`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, device):
        self.device = device
        self.model_path = model_path
        self.init()
        self.load()

    def init(self):
        self.max_length = 351
        # Take hidden states this many layers before the final layer.
        self.hidden_state_skip_layer = 2
        # Number of leading template tokens to crop from outputs/masks.
        self.crop_start = 95
        self.prompt_template = (
            "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
            "1. The main content and theme of the video."
            "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
            "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
            "4. background environment, light, style and atmosphere."
            "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
        )

    def load(self):
        self.model = AutoModel.from_pretrained(self.model_path, low_cpu_mem_usage=True).to(torch.float16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side="right")

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()
    def infer(self, text, args):
        """Encode ``text``.

        Returns ``(hidden_states, attention_mask)`` with the chat-template
        prefix (first ``crop_start`` tokens) removed from both.
        """
        if args.cpu_offload:
            self.to_cuda()
        text = self.prompt_template.format(text)
        tokens = self.tokenizer(
            text,
            return_length=False,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(self.model.device)  # fix: follow the model's device instead of hard-coding "cuda"
        outputs = self.model(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            output_hidden_states=True,
        )
        # Intermediate-layer hidden states, template prefix cropped off.
        last_hidden_state = outputs.hidden_states[-(self.hidden_state_skip_layer + 1)][:, self.crop_start:]
        attention_mask = tokens["attention_mask"][:, self.crop_start:]
        if args.cpu_offload:
            self.to_cpu()
        return last_hidden_state, attention_mask
if __name__ == "__main__":
    from types import SimpleNamespace

    model = TextEncoderHFLlamaModel("/mnt/nvme0/yongyang/projects/hy/HunyuanVideo/ckpts/text_encoder", torch.device("cuda"))
    text = 'A cat walks on the grass, realistic style.'
    # fix: infer() requires an `args` object with a `cpu_offload` flag; the
    # original called infer(text) with no args, raising a TypeError.
    outputs = model.infer(text, SimpleNamespace(cpu_offload=False))
    print(outputs)
This diff is collapsed.
This diff is collapsed.
import os
import torch
from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
class VideoEncoderKLCausal3DModel():
    """Causal 3D VAE wrapper for decoding video latents.

    Loads an ``AutoencoderKLCausal3D`` checkpoint from
    ``<model_path>/hunyuan-video-t2v-720p/vae``, freezes it, and exposes
    :meth:`decode`, which maps latents to pixel-space frames in ``[0, 1]``.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of a ``decode`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, dtype, device):
        self.model_path = model_path
        self.dtype = dtype
        self.device = device
        self.load()

    def load(self):
        self.vae_path = os.path.join(self.model_path, 'hunyuan-video-t2v-720p/vae')
        config = AutoencoderKLCausal3D.load_config(self.vae_path)
        self.model = AutoencoderKLCausal3D.from_config(config)
        ckpt = torch.load(os.path.join(self.vae_path, 'pytorch_model.pt'), map_location='cpu', weights_only=True)
        self.model.load_state_dict(ckpt)
        self.model = self.model.to(dtype=self.dtype, device=self.device)
        # Inference-only: freeze parameters and switch to eval mode.
        self.model.requires_grad_(False)
        self.model.eval()

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()  # consistency with the text encoders; model is frozen/eval anyway
    def decode(self, latents, generator, args):
        """Decode ``latents`` to frames in ``[0, 1]`` (returned on CPU, float32)."""
        if args.cpu_offload:
            self.to_cuda()
        latents = latents / self.model.config.scaling_factor
        # fix: place latents on the model's actual device instead of
        # hard-coding torch.device("cuda"), which broke non-CUDA setups.
        latents = latents.to(dtype=self.dtype, device=next(self.model.parameters()).device)
        self.model.enable_tiling()
        image = self.model.decode(
            latents, return_dict=False, generator=generator
        )[0]
        # Map from the VAE's [-1, 1] output range to [0, 1].
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().float()
        if args.cpu_offload:
            self.to_cpu()
        return image
if __name__ == "__main__":
    # Smoke test: load the VAE in fp16 from the local checkpoint directory.
    ckpt_root = "/mnt/nvme0/yongyang/projects/hy/new/HunyuanVideo/ckpts"
    vae_model = VideoEncoderKLCausal3DModel(ckpt_root, dtype=torch.float16, device=torch.device("cuda"))
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment