"tests/vscode:/vscode.git/clone" did not exist on "4f65af0e252066d961bf864d0862f442e497f619"
Commit daf4c74e authored by helloyongyang's avatar helloyongyang Committed by Yang Yong(雍洋)
Browse files

first commit

parent 6c79160f
This diff is collapsed.
import torch
from transformers import CLIPTextModel, AutoTokenizer
class TextEncoderHFClipModel():
    """Hugging Face CLIP text encoder wrapper.

    Loads the CLIP text model in fp16 on ``device`` and exposes :meth:`infer`,
    which returns the pooled text embedding plus the token attention mask.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of an ``infer`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, device):
        self.device = device
        self.model_path = model_path
        self.init()
        self.load()

    def init(self):
        # 77 is CLIP's standard maximum token sequence length.
        self.max_length = 77

    def load(self):
        self.model = CLIPTextModel.from_pretrained(self.model_path).to(torch.float16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side="right")

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()
    def infer(self, text, args):
        """Encode ``text``.

        Returns a ``(pooled_embedding, attention_mask)`` tuple.
        ``args.cpu_offload`` controls temporary CUDA placement of the model.
        """
        if args.cpu_offload:
            self.to_cuda()
        tokens = self.tokenizer(
            text,
            return_length=False,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(self.model.device)  # fix: follow the model's device instead of hard-coding "cuda"
        outputs = self.model(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            output_hidden_states=False,
        )
        # NOTE: despite the variable name, this is CLIP's pooled output
        # ("pooler_output"), not the token-level last hidden state.
        last_hidden_state = outputs["pooler_output"]
        if args.cpu_offload:
            self.to_cpu()
        return last_hidden_state, tokens["attention_mask"]
if __name__ == "__main__":
    from types import SimpleNamespace

    model = TextEncoderHFClipModel("/mnt/nvme0/yongyang/projects/hy/HunyuanVideo/ckpts/text_encoder_2", torch.device("cuda"))
    text = 'A cat walks on the grass, realistic style.'
    # fix: infer() requires an `args` object with a `cpu_offload` flag; the
    # original called infer(text) with no args, raising a TypeError.
    outputs = model.infer(text, SimpleNamespace(cpu_offload=False))
    print(outputs)
import torch
from transformers import AutoModel, AutoTokenizer
class TextEncoderHFLlamaModel():
    """Hugging Face LLaMA text encoder wrapper.

    Loads the model in fp16 on ``device`` and exposes :meth:`infer`, which
    wraps the prompt in a fixed chat template, encodes it, and returns hidden
    states from an intermediate layer with the template prefix cropped off.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of an ``infer`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, device):
        self.device = device
        self.model_path = model_path
        self.init()
        self.load()

    def init(self):
        self.max_length = 351
        # Take hidden states this many layers before the final layer.
        self.hidden_state_skip_layer = 2
        # Number of leading template tokens to crop from outputs/masks.
        self.crop_start = 95
        self.prompt_template = (
            "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
            "1. The main content and theme of the video."
            "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
            "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
            "4. background environment, light, style and atmosphere."
            "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
        )

    def load(self):
        self.model = AutoModel.from_pretrained(self.model_path, low_cpu_mem_usage=True).to(torch.float16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, padding_side="right")

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()
    def infer(self, text, args):
        """Encode ``text``.

        Returns ``(hidden_states, attention_mask)`` with the chat-template
        prefix (first ``crop_start`` tokens) removed from both.
        """
        if args.cpu_offload:
            self.to_cuda()
        text = self.prompt_template.format(text)
        tokens = self.tokenizer(
            text,
            return_length=False,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        ).to(self.model.device)  # fix: follow the model's device instead of hard-coding "cuda"
        outputs = self.model(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            output_hidden_states=True,
        )
        # Intermediate-layer hidden states, template prefix cropped off.
        last_hidden_state = outputs.hidden_states[-(self.hidden_state_skip_layer + 1)][:, self.crop_start:]
        attention_mask = tokens["attention_mask"][:, self.crop_start:]
        if args.cpu_offload:
            self.to_cpu()
        return last_hidden_state, attention_mask
if __name__ == "__main__":
    from types import SimpleNamespace

    model = TextEncoderHFLlamaModel("/mnt/nvme0/yongyang/projects/hy/HunyuanVideo/ckpts/text_encoder", torch.device("cuda"))
    text = 'A cat walks on the grass, realistic style.'
    # fix: infer() requires an `args` object with a `cpu_offload` flag; the
    # original called infer(text) with no args, raising a TypeError.
    outputs = model.infer(text, SimpleNamespace(cpu_offload=False))
    print(outputs)
This diff is collapsed.
This diff is collapsed.
import os
import torch
from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
class VideoEncoderKLCausal3DModel():
    """Causal 3D VAE wrapper for decoding video latents.

    Loads an ``AutoencoderKLCausal3D`` checkpoint from
    ``<model_path>/hunyuan-video-t2v-720p/vae``, freezes it, and exposes
    :meth:`decode`, which maps latents to pixel-space frames in ``[0, 1]``.
    When ``args.cpu_offload`` is set, the model is moved to CUDA only for the
    duration of a ``decode`` call and parked on CPU otherwise.
    """

    def __init__(self, model_path, dtype, device):
        self.model_path = model_path
        self.dtype = dtype
        self.device = device
        self.load()

    def load(self):
        self.vae_path = os.path.join(self.model_path, 'hunyuan-video-t2v-720p/vae')
        config = AutoencoderKLCausal3D.load_config(self.vae_path)
        self.model = AutoencoderKLCausal3D.from_config(config)
        ckpt = torch.load(os.path.join(self.vae_path, 'pytorch_model.pt'), map_location='cpu', weights_only=True)
        self.model.load_state_dict(ckpt)
        self.model = self.model.to(dtype=self.dtype, device=self.device)
        # Inference-only: freeze parameters and switch to eval mode.
        self.model.requires_grad_(False)
        self.model.eval()

    def to_cpu(self):
        self.model = self.model.to("cpu")

    def to_cuda(self):
        self.model = self.model.to("cuda")

    @torch.no_grad()  # consistency with the text encoders; model is frozen/eval anyway
    def decode(self, latents, generator, args):
        """Decode ``latents`` to frames in ``[0, 1]`` (returned on CPU, float32)."""
        if args.cpu_offload:
            self.to_cuda()
        latents = latents / self.model.config.scaling_factor
        # fix: place latents on the model's actual device instead of
        # hard-coding torch.device("cuda"), which broke non-CUDA setups.
        latents = latents.to(dtype=self.dtype, device=next(self.model.parameters()).device)
        self.model.enable_tiling()
        image = self.model.decode(
            latents, return_dict=False, generator=generator
        )[0]
        # Map from the VAE's [-1, 1] output range to [0, 1].
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().float()
        if args.cpu_offload:
            self.to_cpu()
        return image
if __name__ == "__main__":
    # Smoke test: load the VAE in fp16 from the local checkpoint directory.
    ckpt_root = "/mnt/nvme0/yongyang/projects/hy/new/HunyuanVideo/ckpts"
    vae_model = VideoEncoderKLCausal3DModel(ckpt_root, dtype=torch.float16, device=torch.device("cuda"))
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment