Commit 78bae405 authored by mashun1: open_sora_inference
import collections
import importlib
import logging
import os
import time
from collections import OrderedDict
from collections.abc import Sequence
from itertools import repeat
import numpy as np
import torch
import torch.distributed as dist
def print_rank(var_name, var_value, rank=0):
if dist.get_rank() == rank:
print(f"[Rank {rank}] {var_name}: {var_value}")
def print_0(*args, **kwargs):
if dist.get_rank() == 0:
print(*args, **kwargs)
def requires_grad(model: torch.nn.Module, flag: bool = True) -> None:
"""
Set requires_grad flag for all parameters in a model.
"""
for p in model.parameters():
p.requires_grad = flag
def format_numel_str(numel: int) -> str:
B = 1024**3
M = 1024**2
K = 1024
if numel >= B:
return f"{numel / B:.2f} B"
elif numel >= M:
return f"{numel / M:.2f} M"
elif numel >= K:
return f"{numel / K:.2f} K"
else:
return f"{numel}"
def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
tensor.div_(dist.get_world_size())
return tensor
def get_model_numel(model: torch.nn.Module) -> tuple[int, int]:
num_params = 0
num_params_trainable = 0
for p in model.parameters():
num_params += p.numel()
if p.requires_grad:
num_params_trainable += p.numel()
return num_params, num_params_trainable
def try_import(name):
"""Try to import a module.
Args:
name (str): Specifies what module to import in absolute or relative
terms (e.g. either pkg.mod or ..mod).
Returns:
ModuleType or None: If importing successfully, returns the imported
module, otherwise returns None.
"""
try:
return importlib.import_module(name)
except ImportError:
return None
def transpose(x):
"""
transpose a list of list
Args:
x (list[list]):
"""
ret = list(map(list, zip(*x)))
return ret
def get_timestamp():
timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
return timestamp
def format_time(seconds):
days = int(seconds / 3600 / 24)
seconds = seconds - days * 3600 * 24
hours = int(seconds / 3600)
seconds = seconds - hours * 3600
minutes = int(seconds / 60)
seconds = seconds - minutes * 60
secondsf = int(seconds)
seconds = seconds - secondsf
millis = int(seconds * 1000)
f = ""
i = 1
if days > 0:
f += str(days) + "D"
i += 1
if hours > 0 and i <= 2:
f += str(hours) + "h"
i += 1
if minutes > 0 and i <= 2:
f += str(minutes) + "m"
i += 1
if secondsf > 0 and i <= 2:
f += str(secondsf) + "s"
i += 1
if millis > 0 and i <= 2:
f += str(millis) + "ms"
i += 1
if f == "":
f = "0ms"
return f
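A small sketch showing that format_time keeps at most the two most significant non-zero units:

```python
# Illustrative only: at most two units are reported, from days down to milliseconds.
print(format_time(3661.5))  # "1h1m"  (trailing seconds/millis dropped)
print(format_time(90))      # "1m30s"
print(format_time(0.25))    # "250ms"
```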
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not isinstance(data, str):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f"type {type(data)} cannot be converted to tensor.")
def to_ndarray(data):
if isinstance(data, torch.Tensor):
return data.numpy()
elif isinstance(data, np.ndarray):
return data
elif isinstance(data, Sequence):
return np.array(data)
elif isinstance(data, int):
return np.array([data], dtype=int)
elif isinstance(data, float):
return np.array([data], dtype=float)
else:
raise TypeError(f"type {type(data)} cannot be converted to ndarray.")
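A minimal sketch exercising the two converters above (CPU tensors only, since to_ndarray calls .numpy()):

```python
# Illustrative round trip between Python sequences, tensors, and ndarrays.
t = to_tensor([1.0, 2.0, 3.0])   # float32 tensor of shape (3,)
a = to_ndarray(t)                # numpy array holding the same data
assert to_tensor(a).shape == t.shape
```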
def to_torch_dtype(dtype):
if isinstance(dtype, torch.dtype):
return dtype
elif isinstance(dtype, str):
dtype_mapping = {
"float64": torch.float64,
"float32": torch.float32,
"float16": torch.float16,
"fp32": torch.float32,
"fp16": torch.float16,
"half": torch.float16,
"bf16": torch.bfloat16,
}
if dtype not in dtype_mapping:
raise ValueError(f"Unsupported dtype string: {dtype}")
dtype = dtype_mapping[dtype]
return dtype
else:
raise ValueError(f"Cannot convert {type(dtype)} to a torch dtype.")
def count_params(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
return x
return tuple(repeat(x, n))
return parse
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
def convert_SyncBN_to_BN2d(model_cfg):
for k in model_cfg:
v = model_cfg[k]
if k == "norm_cfg" and v["type"] == "SyncBN":
v["type"] = "BN2d"
elif isinstance(v, dict):
convert_SyncBN_to_BN2d(v)
def get_topk(x, dim=4, k=5):
x = to_tensor(x)
inds = x[..., dim].topk(k)[1]
return x[inds]
def param_sigmoid(x, alpha):
ret = 1 / (1 + (-alpha * x).exp())
return ret
def inverse_param_sigmoid(x, alpha, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2) / alpha
def inverse_sigmoid(x, eps=1e-5):
"""Inverse function of sigmoid.
Args:
x (Tensor): The tensor to do the
inverse.
eps (float): EPS avoid numerical
overflow. Defaults 1e-5.
Returns:
Tensor: The x has passed the inverse
function of sigmoid, has same
shape with input.
"""
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
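A quick numerical check of the helper above: up to the eps clamping, inverse_sigmoid undoes torch.sigmoid:

```python
# Illustrative only: round-trip through sigmoid and its inverse.
x = torch.tensor([-2.0, 0.0, 3.0])
assert torch.allclose(inverse_sigmoid(torch.sigmoid(x)), x, atol=1e-4)
```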
def count_columns(df, columns):
cnt_dict = OrderedDict()
num_samples = len(df)
for col in columns:
d_i = df[col].value_counts().to_dict()
for k in d_i:
d_i[k] = (d_i[k], d_i[k] / num_samples)
cnt_dict[col] = d_i
return cnt_dict
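A hypothetical example of count_columns on a small pandas DataFrame; each value maps to a (count, fraction) pair:

```python
# Illustrative only; requires pandas.
import pandas as pd

df = pd.DataFrame({"lang": ["en", "en", "zh"], "split": ["train", "val", "train"]})
print(count_columns(df, ["lang"]))
# OrderedDict([('lang', {'en': (2, 0.666...), 'zh': (1, 0.333...)})])
```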
def build_logger(work_dir, cfgname):
log_file = cfgname + ".log"
log_path = os.path.join(work_dir, log_file)
logger = logging.getLogger(cfgname)
logger.setLevel(logging.INFO)
# formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
formatter = logging.Formatter("%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
handler1 = logging.FileHandler(log_path)
handler1.setFormatter(formatter)
handler2 = logging.StreamHandler()
handler2.setFormatter(formatter)
logger.addHandler(handler1)
logger.addHandler(handler2)
logger.propagate = False
return logger
from collections import OrderedDict
import torch
@torch.no_grad()
def update_ema(
ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
) -> None:
"""
Step the EMA model towards the current model.
"""
ema_params = OrderedDict(ema_model.named_parameters())
model_params = OrderedDict(model.named_parameters())
for name, param in model_params.items():
if name == "pos_embed":
continue
if not param.requires_grad:
continue
if not sharded:
param_data = param.data
ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
else:
if param.data.dtype != torch.float32:
param_id = id(param)
master_param = optimizer._param_store.working_to_master_param[param_id]
param_data = master_param.data
else:
param_data = param.data
ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
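A minimal sketch (not part of the repository) of how update_ema can be driven in the non-sharded case; the toy linear model and SGD optimizer are assumptions for illustration only:

```python
# Illustrative usage of update_ema: ema = decay * ema + (1 - decay) * param.
from copy import deepcopy

import torch

net = torch.nn.Linear(8, 8)
ema_net = deepcopy(net)
for p in ema_net.parameters():
    p.requires_grad = False

opt = torch.optim.SGD(net.parameters(), lr=1e-2)
loss = net(torch.randn(4, 8)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()
update_ema(ema_net, net, decay=0.999, sharded=False)
```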
import os
import torch
import colossalai
import torch.distributed as dist
from mmengine.runner import set_random_seed
from opensora.datasets import save_sample
from opensora.registry import MODELS, SCHEDULERS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import to_torch_dtype
from opensora.acceleration.parallel_states import set_sequence_parallel_group
from colossalai.cluster import DistCoordinator
def load_prompts(prompt_path):
with open(prompt_path, "r") as f:
prompts = [line.strip() for line in f.readlines()]
return prompts
def main():
# ======================================================
# 1. cfg and init distributed env
# ======================================================
cfg = parse_configs(training=False)
print(cfg)
# init distributed
colossalai.launch_from_torch({})
coordinator = DistCoordinator()
if coordinator.world_size > 1:
set_sequence_parallel_group(dist.group.WORLD)
enable_sequence_parallelism = True
else:
enable_sequence_parallelism = False
# ======================================================
# 2. runtime variables
# ======================================================
torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = to_torch_dtype(cfg.dtype)
set_random_seed(seed=cfg.seed)
prompts = load_prompts(cfg.prompt_path)
# ======================================================
# 3. build model & load weights
# ======================================================
# 3.1. build model
input_size = (cfg.num_frames, *cfg.image_size)
vae = build_module(cfg.vae, MODELS)
latent_size = vae.get_latent_size(input_size)
text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
model = build_module(
cfg.model,
MODELS,
input_size=latent_size,
in_channels=vae.out_channels,
caption_channels=text_encoder.output_dim,
model_max_length=text_encoder.model_max_length,
dtype=dtype,
enable_sequence_parallelism=enable_sequence_parallelism,
)
text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance
# 3.2. move to device & eval
vae = vae.to(device, dtype).eval()
model = model.to(device, dtype).eval()
# 3.3. build scheduler
scheduler = build_module(cfg.scheduler, SCHEDULERS)
# 3.4. support for multi-resolution
model_args = dict()
if cfg.multi_resolution:
image_size = cfg.image_size
hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
ar = torch.tensor([[image_size[0] / image_size[1]]], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
model_args["data_info"] = dict(ar=ar, hw=hw)
# ======================================================
# 4. inference
# ======================================================
sample_idx = 0
save_dir = cfg.save_dir
os.makedirs(save_dir, exist_ok=True)
for i in range(0, len(prompts), cfg.batch_size):
batch_prompts = prompts[i : i + cfg.batch_size]
samples = scheduler.sample(
model,
text_encoder,
z_size=(vae.out_channels, *latent_size),
prompts=batch_prompts,
device=device,
additional_args=model_args,
)
samples = vae.decode(samples.to(dtype))
if coordinator.is_master():
for idx, sample in enumerate(samples):
print(f"Prompt: {batch_prompts[idx]}")
save_path = os.path.join(save_dir, f"sample_{sample_idx}")
save_sample(sample, fps=cfg.fps, save_path=save_path)
sample_idx += 1
if __name__ == "__main__":
main()
from copy import deepcopy
import colossalai
import torch
import torch.distributed as dist
import wandb
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
from tqdm import tqdm
from opensora.acceleration.checkpoint import set_grad_checkpoint
from opensora.acceleration.parallel_states import (
get_data_parallel_group,
set_data_parallel_group,
set_sequence_parallel_group,
)
from opensora.acceleration.plugin import ZeroSeqParallelPlugin
from opensora.datasets import DatasetFromCSV, get_transforms_image, get_transforms_video, prepare_dataloader
from opensora.registry import MODELS, SCHEDULERS, build_module
from opensora.utils.ckpt_utils import create_logger, load, model_sharding, record_model_param_shape, save
from opensora.utils.config_utils import (
create_experiment_workspace,
create_tensorboard_writer,
parse_configs,
save_training_config,
)
from opensora.utils.misc import all_reduce_mean, format_numel_str, get_model_numel, requires_grad, to_torch_dtype
from opensora.utils.train_utils import update_ema
def main():
# ======================================================
# 1. args & cfg
# ======================================================
cfg = parse_configs(training=True)
print(cfg)
exp_name, exp_dir = create_experiment_workspace(cfg)
save_training_config(cfg._cfg_dict, exp_dir)
# ======================================================
# 2. runtime variables & colossalai launch
# ======================================================
assert torch.cuda.is_available(), "Training currently requires at least one GPU."
assert cfg.dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg.dtype}"
# 2.1. colossalai init distributed training
colossalai.launch_from_torch({})
coordinator = DistCoordinator()
device = get_current_device()
dtype = to_torch_dtype(cfg.dtype)
# 2.2. init logger, tensorboard & wandb
if not coordinator.is_master():
logger = create_logger(None)
else:
logger = create_logger(exp_dir)
logger.info(f"Experiment directory created at {exp_dir}")
writer = create_tensorboard_writer(exp_dir)
if cfg.wandb:
wandb.init(project="minisora", name=exp_name, config=cfg._cfg_dict)
# 2.3. initialize ColossalAI booster
if cfg.plugin == "zero2":
plugin = LowLevelZeroPlugin(
stage=2,
precision=cfg.dtype,
initial_scale=2**16,
max_norm=cfg.grad_clip,
)
set_data_parallel_group(dist.group.WORLD)
elif cfg.plugin == "zero2-seq":
plugin = ZeroSeqParallelPlugin(
sp_size=cfg.sp_size,
stage=2,
precision=cfg.dtype,
initial_scale=2**16,
max_norm=cfg.grad_clip,
)
set_sequence_parallel_group(plugin.sp_group)
set_data_parallel_group(plugin.dp_group)
else:
raise ValueError(f"Unknown plugin {cfg.plugin}")
booster = Booster(plugin=plugin)
# ======================================================
# 3. build dataset and dataloader
# ======================================================
dataset = DatasetFromCSV(
cfg.data_path,
# TODO: change transforms
transform=(
get_transforms_video(cfg.image_size[0])
if not cfg.use_image_transform
else get_transforms_image(cfg.image_size[0])
),
num_frames=cfg.num_frames,
frame_interval=cfg.frame_interval,
root=cfg.root,
)
# TODO: use plugin's prepare dataloader
# a batch contains:
# {
# "video": torch.Tensor, # [B, C, T, H, W],
# "text": List[str],
# }
dataloader = prepare_dataloader(
dataset,
batch_size=cfg.batch_size,
num_workers=cfg.num_workers,
shuffle=True,
drop_last=True,
pin_memory=True,
process_group=get_data_parallel_group(),
)
logger.info(f"Dataset contains {len(dataset):,} videos ({cfg.data_path})")
total_batch_size = cfg.batch_size * dist.get_world_size() // cfg.sp_size
logger.info(f"Total batch size: {total_batch_size}")
# ======================================================
# 4. build model
# ======================================================
# 4.1. build model
input_size = (cfg.num_frames, *cfg.image_size)
vae = build_module(cfg.vae, MODELS)
latent_size = vae.get_latent_size(input_size)
text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
model = build_module(
cfg.model,
MODELS,
input_size=latent_size,
in_channels=vae.out_channels,
caption_channels=text_encoder.output_dim,
model_max_length=text_encoder.model_max_length,
dtype=dtype,
)
model_numel, model_numel_trainable = get_model_numel(model)
logger.info(
f"Trainable model params: {format_numel_str(model_numel_trainable)}, Total model params: {format_numel_str(model_numel)}"
)
# 4.2. create ema
ema = deepcopy(model).to(torch.float32).to(device)
requires_grad(ema, False)
ema_shape_dict = record_model_param_shape(ema)
# 4.3. move to device
vae = vae.to(device, dtype)
model = model.to(device, dtype)
# 4.4. build scheduler
scheduler = build_module(cfg.scheduler, SCHEDULERS)
# 4.5. setup optimizer
optimizer = HybridAdam(
filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.lr, weight_decay=0, adamw_mode=True
)
lr_scheduler = None
# 4.6. prepare for training
if cfg.grad_checkpoint:
set_grad_checkpoint(model)
model.train()
update_ema(ema, model, decay=0, sharded=False)
ema.eval()
# =======================================================
# 5. boost model for distributed training with colossalai
# =======================================================
torch.set_default_dtype(dtype)
model, optimizer, _, dataloader, lr_scheduler = booster.boost(
model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, dataloader=dataloader
)
torch.set_default_dtype(torch.float)
num_steps_per_epoch = len(dataloader)
logger.info("Boost model for distributed training")
# =======================================================
# 6. training loop
# =======================================================
start_epoch = start_step = log_step = sampler_start_idx = 0
running_loss = 0.0
# 6.1. resume training
if cfg.load is not None:
logger.info("Loading checkpoint")
start_epoch, start_step, sampler_start_idx = load(booster, model, ema, optimizer, lr_scheduler, cfg.load)
logger.info(f"Loaded checkpoint {cfg.load} at epoch {start_epoch} step {start_step}")
logger.info(f"Training for {cfg.epochs} epochs with {num_steps_per_epoch} steps per epoch")
dataloader.sampler.set_start_index(sampler_start_idx)
model_sharding(ema)
# 6.2. training loop
for epoch in range(start_epoch, cfg.epochs):
dataloader.sampler.set_epoch(epoch)
dataloader_iter = iter(dataloader)
logger.info(f"Beginning epoch {epoch}...")
with tqdm(
range(start_step, num_steps_per_epoch),
desc=f"Epoch {epoch}",
disable=not coordinator.is_master(),
total=num_steps_per_epoch,
initial=start_step,
) as pbar:
for step in pbar:
batch = next(dataloader_iter)
x = batch["video"].to(device, dtype) # [B, C, T, H, W]
y = batch["text"]
with torch.no_grad():
# Prepare visual inputs
x = vae.encode(x) # [B, C, T, H/P, W/P]
# Prepare text inputs
model_args = text_encoder.encode(y)
# Diffusion
t = torch.randint(0, scheduler.num_timesteps, (x.shape[0],), device=device)
loss_dict = scheduler.training_losses(model, x, t, model_args)
# Backward & update
loss = loss_dict["loss"].mean()
booster.backward(loss=loss, optimizer=optimizer)
optimizer.step()
optimizer.zero_grad()
# Update EMA
update_ema(ema, model.module, optimizer=optimizer)
# Log loss values:
all_reduce_mean(loss)
running_loss += loss.item()
global_step = epoch * num_steps_per_epoch + step
log_step += 1
# Log to tensorboard
if coordinator.is_master() and (global_step + 1) % cfg.log_every == 0:
avg_loss = running_loss / log_step
pbar.set_postfix({"loss": avg_loss, "step": step, "global_step": global_step})
running_loss = 0
log_step = 0
writer.add_scalar("loss", loss.item(), global_step)
if cfg.wandb:
wandb.log(
{
"iter": global_step,
"num_samples": global_step * total_batch_size,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
},
step=global_step,
)
# Save checkpoint
if cfg.ckpt_every > 0 and (global_step + 1) % cfg.ckpt_every == 0:
save(
booster,
model,
ema,
optimizer,
lr_scheduler,
epoch,
step + 1,
global_step + 1,
cfg.batch_size,
coordinator,
exp_dir,
ema_shape_dict,
)
logger.info(
f"Saved checkpoint at epoch {epoch} step {step + 1} global_step {global_step + 1} to {exp_dir}"
)
# subsequent epochs are not resumed from a checkpoint, so reset the sampler start index and start step
dataloader.sampler.set_start_index(0)
start_step = 0
if __name__ == "__main__":
main()
from typing import List
from setuptools import find_packages, setup
def fetch_requirements(path) -> List[str]:
"""
This function reads the requirements file.
Args:
path (str): the path to the requirements file.
Returns:
The lines in the requirements file.
"""
with open(path, "r") as fd:
return [r.strip() for r in fd.readlines()]
def fetch_readme() -> str:
"""
This function reads the README.md file in the current directory.
Returns:
The lines in the README file.
"""
with open("README.md", encoding="utf-8") as f:
return f.read()
setup(
name="opensora",
version="1.0.0",
packages=find_packages(
exclude=(
"assets",
"configs",
"docs",
"outputs",
"pretrained_models",
"scripts",
"tests",
"tools",
"*.egg-info",
)
),
description="Democratizing Efficient Video Production for All",
long_description=fetch_readme(),
long_description_content_type="text/markdown",
license="Apache Software License 2.0",
install_requires=fetch_requirements("requirements.txt"),
python_requires=">=3.6",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Environment :: GPU :: NVIDIA CUDA",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: System :: Distributed Computing",
],
)
import colossalai
import torch
import torch.distributed as dist
from colossalai.testing import spawn
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import set_sequence_parallel_group
from opensora.models.layers.blocks import (
Attention,
MultiHeadCrossAttention,
SeqParallelAttention,
SeqParallelMultiHeadCrossAttention,
)
def run_attention(rank, world_size):
# create model
torch.manual_seed(1024)
set_sequence_parallel_group(dist.group.WORLD)
seq_parallel_attention = SeqParallelAttention(dim=256, num_heads=4, qkv_bias=True, enable_flashattn=False).cuda()
torch.manual_seed(1024)
attention = Attention(
dim=256,
num_heads=4,
qkv_bias=True,
enable_flashattn=False,
).cuda()
# create inputs
torch.manual_seed(1024)
x = torch.randn(4, 64, 256).cuda()
seq_x = x.clone().detach()
x.requires_grad = True
x.retain_grad()
seq_x.requires_grad = True
seq_x.retain_grad()
sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
# run model
out = attention(x)
sub_seq_out = seq_parallel_attention(sub_seq_x)
seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
assert torch.allclose(seq_out, out, atol=1e-7), f"{seq_out}\nvs\n{out}"
# run backward
seq_out.mean().backward()
out.mean().backward()
# all reduce gradient for sp
for p in seq_parallel_attention.parameters():
if p.grad is not None:
dist.all_reduce(p.grad, group=dist.group.WORLD)
p.grad.div_(world_size)
# check grad
for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
assert torch.allclose(p1.grad, p2.grad, atol=1e-7), f"{p1.grad}\nvs\n{p2.grad}"
# check input grad
assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
def run_cross_attention(rank, world_size):
# create model
torch.manual_seed(1024)
set_sequence_parallel_group(dist.group.WORLD)
seq_parallel_attention = SeqParallelMultiHeadCrossAttention(
d_model=256,
num_heads=4,
).cuda().to(torch.bfloat16)
torch.manual_seed(1024)
attention = MultiHeadCrossAttention(
d_model=256,
num_heads=4,
).cuda().to(torch.bfloat16)
# make sure the weights are the same
for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
p1.data.copy_(p2.data)
# create inputs
torch.manual_seed(1024)
x = torch.randn(4, 64, 256).cuda().to(torch.bfloat16)
y = torch.randn(4, 32, 256).cuda().to(torch.bfloat16)
mask = [2, 10, 8, 16]
mask = None  # the masked path is disabled here; drop this line to test variable-length inputs with the list above
seq_x = x.clone().detach()
seq_y = y.clone().detach()
# set grad
x.requires_grad = True
x.retain_grad()
seq_x.requires_grad = True
seq_x.retain_grad()
y.requires_grad = True
y.retain_grad()
seq_y.requires_grad = True
seq_y.retain_grad()
# split by sequence
sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
# run model
out = attention(x, y, mask)
sub_seq_out = seq_parallel_attention(sub_seq_x, seq_y, mask)
seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
assert torch.allclose(seq_out, out, rtol=1e-5, atol=1e-6), f"\n{seq_out}\nvs\n{out}"
# run backward
seq_out.mean().backward()
out.mean().backward()
# all reduce gradient for sp
for name, p in seq_parallel_attention.named_parameters():
if p.grad is not None:
dist.all_reduce(p.grad, group=dist.group.WORLD)
p.grad.div_(world_size)
else:
print(f"grad of {name} is None")
# # check grad
for p1, p2 in zip(seq_parallel_attention.named_parameters(), attention.named_parameters()):
assert torch.allclose(p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
# # check input grad
assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
assert torch.allclose(y.grad, seq_y.grad, atol=1e-7), f"{y.grad}\nvs\n{seq_y.grad}"
def run_dist(rank, world_size, port):
colossalai.launch({}, rank=rank, world_size=world_size, host="localhost", port=port)
# run_attention(rank, world_size)
run_cross_attention(rank, world_size)
def test_seq_parallel_attention():
spawn(run_dist, nprocs=2)
if __name__ == "__main__":
test_seq_parallel_attention()
import time
from copy import deepcopy
import colossalai
import torch
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.testing import spawn
from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
from opensora.models.text_encoder.t5 import T5Embedder
def run_t5_encoder(rank, world_size, port):
colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost")
# t5 embedder
t5_path = "./pretrained_models/t5_ckpts"
hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float)
sf_t5 = deepcopy(hf_t5)
# create huggingface model as normal
shard_config = ShardConfig(
tensor_parallel_process_group=None,
pipeline_stage_manager=None,
enable_tensor_parallelism=False,
enable_fused_normalization=False,
enable_flash_attention=False,
enable_jit_fused=True,
enable_sequence_parallelism=False,
enable_sequence_overlap=False,
)
shard_former = ShardFormer(shard_config=shard_config)
sharded_model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy())
sf_t5.model = sharded_model
# test t5 embedder
texts = ["Who is the best player in the history of NBA?", "How to study computer science?"]
for i in range(5):
hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
# check accuracy
assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}"
assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}"
# measure perf
torch.cuda.synchronize()
hf_start = time.time()
for i in range(20):
hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
torch.cuda.synchronize()
hf_end = time.time()
# convert sf to fp16 and time the sharded model
sf_t5.model = sf_t5.model.half()
torch.cuda.synchronize()
sf_start = time.time()
for i in range(20):
sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
torch.cuda.synchronize()
sf_end = time.time()
print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s")
def test_t5_encoder():
spawn(run_t5_encoder)
if __name__ == "__main__":
test_t5_encoder()
# Video Captioning
Human labeling of videos is expensive and time-consuming, so we adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves better quality, its speed of roughly 20 s per sample is too slow for us. With batch inference, LLaVA reaches about 3 s per sample with comparable quality. LLaVA is the second-best open-source model on [MMMU](https://mmmu-benchmark.github.io/) and accepts images of any resolution.
![Caption](https://i0.imgs.ovh/2024/03/16/eXdvC.png)
## GPT-4V Captioning
Run the following command to generate captions for videos with GPT-4V:
```bash
python -m tools.caption.caption_gpt4 FOLDER_WITH_VIDEOS output.csv --key $OPENAI_API_KEY
```
The cost is approximately $0.01 per video (3 frames per video). The output is a CSV file with path and caption.
## LLaVA Captioning
First, install LLaVA according to their [official instructions](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#install). We use the `liuhaotian/llava-v1.6-34b` model for captioning, which can be downloaded [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b). Then, run the following command to generate captions for videos with LLaVA:
```bash
CUDA_VISIBLE_DEVICES=0,1 python -m tools.caption.caption_llava samples output.csv
```
The Yi-34B model requires two 80GB GPUs and takes about 3 s per sample. The output is a CSV file with the video path and caption.
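For downstream use, a minimal sketch of reading the resulting CSV; the three-column layout (video path, caption, length) matches the `writer.writerow` calls in the captioning scripts below, and the file has no header row:

```python
# Illustrative only: parse the caption CSV produced by the captioning scripts.
import csv

with open("output.csv", newline="") as f:
    for path, caption, length in csv.reader(f):
        print(path, caption[:60], length)
```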
import argparse
import csv
import os
import requests
import tqdm
from .utils import extract_frames, prompts, read_video_list
def get_caption(frame, prompt, api_key):
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
payload = {
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt,
},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[0]}"}},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[1]}"}},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[2]}"}},
],
}
],
"max_tokens": 300,
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60)
caption = response.json()["choices"][0]["message"]["content"]
caption = caption.replace("\n", " ")
return caption
def main(args):
# ======================================================
# 1. read video list
# ======================================================
videos = read_video_list(args.video_folder, args.output_file)
f = open(args.output_file, "a")
writer = csv.writer(f)
# ======================================================
# 2. generate captions
# ======================================================
for video in tqdm.tqdm(videos):
video_path = os.path.join(args.video_folder, video)
frame, length = extract_frames(video_path, base_64=True)
if len(frame) < 3:
continue
prompt = prompts[args.prompt]
caption = get_caption(frame, prompt, args.key)
writer.writerow((video, caption, length))
f.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("video_folder", type=str)
parser.add_argument("output_file", type=str)
parser.add_argument("--prompt", type=str, default="three_frames")
parser.add_argument("--key", type=str)
args = parser.parse_args()
main(args)
import argparse
import csv
import os
import warnings
import torch
from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import get_anyres_image_grid_shape, get_model_name_from_path, process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model
from llava.model.llava_arch import unpad_image
from llava.utils import disable_torch_init
from tqdm import tqdm
from .utils import extract_frames, prompts, read_video_list
disable_torch_init()
def prepare_inputs_labels_for_multimodal(
self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None
):
# llava_arch.py
vision_tower = self.get_vision_tower()
if vision_tower is None or images is None or input_ids.shape[1] == 1:
return input_ids, position_ids, attention_mask, past_key_values, None, labels
if type(images) is list or images.ndim == 5:
if type(images) is list:
images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
concat_images = torch.cat([image for image in images], dim=0)
image_features = self.encode_images(concat_images)
split_sizes = [image.shape[0] for image in images]
image_features = torch.split(image_features, split_sizes, dim=0)
mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
if mm_patch_merge_type == "flat":
image_features = [x.flatten(0, 1) for x in image_features]
elif mm_patch_merge_type.startswith("spatial"):
new_image_features = []
for image_idx, image_feature in enumerate(image_features):
if image_feature.shape[0] > 1:
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
height = width = self.get_vision_tower().num_patches_per_side
assert height * width == base_image_feature.shape[0]
if image_aspect_ratio == "anyres":
num_patch_width, num_patch_height = get_anyres_image_grid_shape(
image_sizes[image_idx],
self.config.image_grid_pinpoints,
self.get_vision_tower().config.image_size,
)
image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
else:
raise NotImplementedError
if "unpad" in mm_patch_merge_type:
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
image_feature = unpad_image(image_feature, image_sizes[image_idx])
image_feature = torch.cat(
(
image_feature,
self.model.image_newline[:, None, None]
.expand(*image_feature.shape[:-1], 1)
.to(image_feature.device),
),
dim=-1,
)
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
else:
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.flatten(0, 3)
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
else:
image_feature = image_feature[0]
if "unpad" in mm_patch_merge_type:
image_feature = torch.cat(
(image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0
)
new_image_features.append(image_feature)
image_features = new_image_features
else:
raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
else:
image_features = self.encode_images(images)
# TODO: image start / end is not implemented here to support pretraining.
if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
raise NotImplementedError
# Let's just add dummy tensors if they do not exist,
# it is a headache to deal with None all the time.
# But it is not ideal, and if you have a better idea,
# please open an issue / submit a PR, thanks.
_labels = labels
_position_ids = position_ids
_attention_mask = attention_mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if position_ids is None:
position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
if labels is None:
labels = torch.full_like(input_ids, IGNORE_INDEX)
# remove the padding using attention_mask -- FIXME
input_ids = [
cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
]
labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
new_input_embeds = []
new_labels = []
cur_image_idx = 0
for batch_idx, cur_input_ids in enumerate(input_ids):
num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
if num_images == 0:
cur_image_features = image_features[cur_image_idx]
cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
new_input_embeds.append(cur_input_embeds)
new_labels.append(labels[batch_idx])
cur_image_idx += 1
continue
image_token_indices = (
[-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
)
cur_input_ids_noim = []
cur_labels = labels[batch_idx]
cur_labels_noim = []
for i in range(len(image_token_indices) - 1):
cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
split_sizes = [x.shape[0] for x in cur_labels_noim]
cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
cur_new_input_embeds = []
cur_new_labels = []
for i in range(num_images + 1):
cur_new_input_embeds.append(cur_input_embeds_no_im[i])
cur_new_labels.append(cur_labels_noim[i])
if i < num_images:
cur_image_features = image_features[cur_image_idx]
cur_image_idx += 1
cur_new_input_embeds.append(cur_image_features)
cur_new_labels.append(
torch.full(
(cur_image_features.shape[0],),
IGNORE_INDEX,
device=cur_labels.device,
dtype=cur_labels.dtype,
)
)
cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
cur_new_input_embeds = torch.cat(cur_new_input_embeds)
cur_new_labels = torch.cat(cur_new_labels)
new_input_embeds.append(cur_new_input_embeds)
new_labels.append(cur_new_labels)
# Truncate sequences to max length as image embeddings can make the sequence longer
tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
if tokenizer_model_max_length is not None:
new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
# Combine them
max_len = max(x.shape[0] for x in new_input_embeds)
batch_size = len(new_input_embeds)
new_input_embeds_padded = []
new_labels_padded = torch.full(
(batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device
)
attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
cur_len = cur_new_embed.shape[0]
if getattr(self.config, "tokenizer_padding_side", "right") == "left":
new_input_embeds_padded.append(
torch.cat(
(
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
cur_new_embed,
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, -cur_len:] = cur_new_labels
attention_mask[i, -cur_len:] = True
position_ids[i, -cur_len:] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
else:
new_input_embeds_padded.append(
torch.cat(
(
cur_new_embed,
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, :cur_len] = cur_new_labels
attention_mask[i, :cur_len] = True
position_ids[i, :cur_len] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
if _labels is None:
new_labels = None
else:
new_labels = new_labels_padded
if _attention_mask is None:
attention_mask = None
else:
attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
if _position_ids is None:
position_ids = None
return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
@torch.inference_mode()
def main(args):
# ======================================================
# 1. read video list
# ======================================================
videos = read_video_list(args.video_folder, args.output_file)
f = open(args.output_file, "a")
writer = csv.writer(f)
# ======================================================
# 2. load model and prepare prompts
# ======================================================
model_path = "liuhaotian/llava-v1.6-34b"
query = prompts[args.prompt]
print(f"Prompt: {query}")
conv = conv_templates["chatml_direct"].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
prompt = conv.get_prompt()
with warnings.catch_warnings():
warnings.simplefilter("ignore") # Pytorch non-meta copying warning fills out the console
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=get_model_name_from_path(model_path),
)
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
input_ids = input_ids.unsqueeze(0).to(model.device)
# ======================================================
# 3. generate captions
# ======================================================
bs = args.bs
for i in tqdm(range(0, len(videos), bs)):
# prepare a batch of inputs
video_files = videos[i : i + bs]
frames = []
video_lengths = []
for video_file in video_files:
frame, length = extract_frames(os.path.join(args.video_folder, video_file))
if len(frame) < 3:
continue
frames.append(frame)
video_lengths.append(length)
if len(frames) == 0:
continue
# encode the batch of inputs
samples = []
for imgs in frames:
imgs_size = [img.size for img in imgs]
imgs = process_images(imgs, image_processor, model.config)
imgs = imgs.to(model.device, dtype=torch.float16)
with torch.inference_mode():
_, _, _, _, inputs_embeds, _ = prepare_inputs_labels_for_multimodal(
model, input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
)
samples.append(inputs_embeds)
# padding
max_len = max([sample.shape[1] for sample in samples])
attention_mask = torch.tensor(
[[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))]
).to(model.device)
inputs_embeds = [
torch.cat(
[
torch.zeros(
(1, max_len - samples[i].shape[1], samples[i].shape[-1]),
device=model.device,
dtype=torch.float16,
),
samples[i],
],
dim=1,
)
for i in range(len(samples))
]
inputs_embeds = torch.cat(inputs_embeds, dim=0)
# generate outputs
output_ids = super(type(model), model).generate(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
do_sample=True,
temperature=0.2,
max_new_tokens=512,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
outputs = [output.replace("\n", " ").strip() for output in outputs]
# save results
result = list(zip(video_files, outputs, video_lengths))
for t in result:
writer.writerow(t)
f.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("video_folder", type=str)
parser.add_argument("output_file", type=str)
parser.add_argument("--bs", type=int, default=32)
parser.add_argument("--prompt", type=str, default="three_frames")
args = parser.parse_args()
main(args)