Commit 30af93f2 authored by chenpangpang's avatar chenpangpang
Browse files

feat: gpu初始提交

parent 68e98ab8
Pipeline #2159 canceled with stages
import os

import numpy as np
import torch
import torchvision
from PIL import Image, ImageDraw, ImageFont
from torch import Tensor
from torchvision.transforms.functional import to_tensor
from torchvision.utils import make_grid
from tqdm import tqdm
def frames_to_mp4(frame_dir, output_path, fps):
    """Encode the image frames in *frame_dir* into an H.264 MP4 at *output_path*.

    Frames are read in lexicographic filename order, converted to float
    tensors in [0, 1], then rescaled to uint8 THWC for torchvision's writer.
    """

    def _load_frames(directory: os.PathLike, num_frames: int):
        # A falsy num_frames (None/0) means "take every frame in the directory".
        names = sorted(os.listdir(directory))
        if num_frames:
            names = names[:num_frames]
        frames = [
            to_tensor(Image.open(os.path.join(directory, name))) for name in names
        ]
        return torch.stack(frames)

    clip = _load_frames(frame_dir, num_frames=None)
    # [T, C, H, W] float in [0, 1] -> [T, H, W, C] uint8 in [0, 255]
    clip = clip.mul(255).to(torch.uint8).permute(0, 2, 3, 1)
    torchvision.io.write_video(
        output_path, clip, fps=fps, video_codec="h264", options={"crf": "10"}
    )
def tensor_to_mp4(video, savepath, fps, rescale=True, nrow=None):
    """Write a batched video tensor to an H.264 MP4 as one tiled grid video.

    video: torch.Tensor, b,c,t,h,w, 0-1
    if -1~1, enable rescale=True
    """
    batch = video.shape[0]
    if nrow is None:
        nrow = int(np.sqrt(batch))
    # [b, c, t, h, w] -> [t, b, c, h, w]: iterate once per timestep.
    per_step = video.permute(2, 0, 1, 3, 4)
    sheets = []
    for sheet in per_step:
        # Each sheet tiles the batch into a single [3, grid_h, grid_w] image.
        sheets.append(torchvision.utils.make_grid(sheet, nrow=nrow))
    grid = torch.stack(sheets, dim=0)  # [T, 3, grid_h, grid_w]
    grid = torch.clamp(grid.float(), -1.0, 1.0)
    if rescale:
        grid = (grid + 1.0) / 2.0  # map [-1, 1] -> [0, 1]
    # torchvision.io expects uint8 frames shaped [T, H, W, C]
    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
    torchvision.io.write_video(
        savepath, grid, fps=fps, video_codec="h264", options={"crf": "10"}
    )
def tensor2videogrids(video, root, filename, fps, rescale=True, clamp=True):
    """Tile a batched [b, c, t, h, w] video tensor into a grid video and save
    it as root/filename (H.264, crf 10)."""
    assert video.dim() == 5  # b,c,t,h,w
    assert isinstance(video, torch.Tensor)
    video = video.detach().cpu()
    if clamp:
        video = torch.clamp(video, -1.0, 1.0)
    batch = video.shape[0]
    nrow = int(np.sqrt(batch))
    per_step = video.permute(2, 0, 1, 3, 4)  # t,n,c,h,w
    # One [3, grid_h, grid_w] sheet per timestep, stacked along time.
    grid = torch.stack(
        [torchvision.utils.make_grid(sheet, nrow=nrow) for sheet in per_step],
        dim=0,
    )  # [T, 3, grid_h, grid_w]
    if rescale:
        grid = (grid + 1.0) / 2.0  # map [-1, 1] -> [0, 1]
    # [T, 3, grid_h, grid_w] -> uint8 [T, grid_h, grid_w, 3] for the writer
    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
    out_path = os.path.join(root, filename)
    torchvision.io.write_video(
        out_path, grid, fps=fps, video_codec="h264", options={"crf": "10"}
    )
# print('Finish!')
def log_txt_as_img(wh, xc, size=10):
    """Render a batch of caption strings onto white images for logging.

    Fix: this function uses ImageDraw and ImageFont, which were never
    imported (the file only imported PIL.Image) — every call raised
    NameError. Both names are now imported at the top of the file.

    Args:
        wh: (width, height) tuple for each caption image.
        xc: list of caption strings, one per batch element.
        size: font size in points.

    Returns:
        torch.Tensor of shape [b, 3, h, w] with values in [-1, 1].
    """
    b = len(xc)
    txts = list()
    for bi in range(b):
        txt = Image.new("RGB", wh, color="white")
        draw = ImageDraw.Draw(txt)
        # NOTE(review): font path is hard-coded relative to the CWD — confirm
        # data/DejaVuSans.ttf exists wherever this is invoked.
        font = ImageFont.truetype("data/DejaVuSans.ttf", size=size)
        # Hard-wrap long captions: ~40 chars per line at 256 px, scaled by width.
        nc = int(40 * (wh[0] / 256))
        lines = "\n".join(
            xc[bi][start : start + nc] for start in range(0, len(xc[bi]), nc)
        )
        try:
            draw.text((0, 0), lines, fill="black", font=font)
        except UnicodeEncodeError:
            print("Cant encode string for logging. Skipping.")
        # HWC uint8 [0, 255] -> CHW float [-1, 1]
        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
        txts.append(txt)
    txts = np.stack(txts)
    txts = torch.tensor(txts)
    return txts
def log_local(batch_logs, save_dir, filename, save_fps=10, rescale=True):
    """Save the entries of a log dict to disk; the value type picks the format.

    Per key in batch_logs:
      - list[str]            -> "<key>-<filename>.txt" (one caption per line)
      - Tensor [b,c,t,h,w]   -> "<key>-<filename>.mp4" (vertical grid video)
      - Tensor [b,c,h,w]     -> "<key>-<filename>.jpg" (vertical image grid)
    Tensors whose channel count is neither 1 nor 3 are skipped; any other
    value type is ignored. rescale=True maps values from [-1, 1] to [0, 1].

    Fixes: removed a redundant f.close() inside a `with` block, dropped two
    unused `n` locals, and guarded against an empty caption list.
    """
    if batch_logs is None:
        return None

    def save_img_grid(grid, path, rescale):
        # grid: [c, h, w] image grid; written as an 8-bit image file.
        if rescale:
            grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1
        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)  # c,h,w -> h,w,c
        grid = grid.numpy()
        grid = (grid * 255).astype(np.uint8)
        os.makedirs(os.path.split(path)[0], exist_ok=True)
        Image.fromarray(grid).save(path)

    for key in batch_logs:
        value = batch_logs[key]
        if isinstance(value, list) and value and isinstance(value[0], str):
            # a batch of captions
            path = os.path.join(save_dir, "%s-%s.txt" % (key, filename))
            with open(path, "w") as f:
                for i, txt in enumerate(value):
                    f.write(f"idx={i}, txt={txt}\n")
        elif isinstance(value, torch.Tensor) and value.dim() == 5:
            # save video grids; only grayscale or rgb are supported
            video = value  # b,c,t,h,w
            if video.shape[1] != 1 and video.shape[1] != 3:
                continue
            video = video.permute(2, 0, 1, 3, 4)  # t,n,c,h,w
            # nrow=1 stacks the batch vertically: each frame is [3, n*h, w]
            frame_grids = [
                torchvision.utils.make_grid(framesheet, nrow=1)
                for framesheet in video
            ]
            grid = torch.stack(frame_grids, dim=0)  # [t, 3, n*h, w]
            if rescale:
                grid = (grid + 1.0) / 2.0
            grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
            path = os.path.join(save_dir, "%s-%s.mp4" % (key, filename))
            torchvision.io.write_video(
                path, grid, fps=save_fps, video_codec="h264", options={"crf": "10"}
            )
        elif isinstance(value, torch.Tensor) and value.dim() == 4:
            img = value
            if img.shape[1] != 1 and img.shape[1] != 3:
                continue
            grid = torchvision.utils.make_grid(img, nrow=1)
            path = os.path.join(save_dir, "%s-%s.jpg" % (key, filename))
            save_img_grid(grid, path, rescale)
        else:
            pass  # unsupported value type: silently ignored (original behavior)
def prepare_to_log(batch_logs, max_images=100000, clamp=True):
    """Truncate, detach, and optionally clamp every entry of a log dict.

    Each value is cut to at most max_images along its first dimension;
    tensor values are moved to CPU and, when clamp=True, clamped to [-1, 1].
    Returns the (mutated) dict, or None when given None.
    """
    if batch_logs is None:
        return None
    for key, value in batch_logs.items():
        # Works for tensors (shape) and plain sequences (len) alike.
        size = value.shape[0] if hasattr(value, "shape") else len(value)
        value = value[: min(size, max_images)]
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu()
            if clamp:
                try:
                    value = torch.clamp(value.float(), -1.0, 1.0)
                except RuntimeError:
                    print("clamp_scalar_cpu not implemented for Half")
        batch_logs[key] = value
    return batch_logs
def fill_with_black_squares(video, desired_len: int) -> Tensor:
    """Pad a [t, c, h, w] clip with all-zero (black) frames up to desired_len.

    Returns the clip unchanged when it is already long enough.
    """
    missing = desired_len - len(video)
    if missing <= 0:
        return video
    # One black frame shaped like the first frame, repeated to fill the gap.
    padding = torch.zeros_like(video[0]).unsqueeze(0).repeat(missing, 1, 1, 1)
    return torch.cat([video, padding], dim=0)
def load_num_videos(data_path, num_videos):
    """Load an [N, T, H, W, C] video array and keep the first num_videos.

    Args:
        data_path: path to a .npz file holding key "arr_0", or an ndarray
            that is passed through unchanged.
        num_videos: number of videos to keep; None keeps all of them.

    Returns:
        np.ndarray of shape [min(N, num_videos), T, H, W, C].

    Raises:
        TypeError: if data_path is neither a str path nor an ndarray
            (was a bare `raise Exception` with no message).
    """
    if isinstance(data_path, str):
        videos = np.load(data_path)["arr_0"]  # NTHWC
    elif isinstance(data_path, np.ndarray):
        videos = data_path
    else:
        raise TypeError(
            f"data_path must be str or np.ndarray, got {type(data_path)!r}"
        )
    if num_videos is not None:
        videos = videos[:num_videos]
    return videos
def npz_to_video_grid(
    data_path, out_path, num_frames, fps, num_videos=None, nrow=None, verbose=True
):
    """Render an NTHWC .npz video array (or ndarray) as one MP4 grid video.

    Args:
        data_path: .npz path (key "arr_0") or an ndarray [N, T, H, W, C].
        out_path: destination .mp4 path; parent dirs are created as needed.
        num_frames: pad every clip with black frames up to this length.
        fps: output frame rate.
        num_videos: optional cap on the number of clips (path input only).
        nrow: grid columns; defaults to ceil(sqrt(N)).
        verbose: show tqdm progress bars while padding and tiling.

    Raises:
        TypeError: if data_path is neither a str path nor an ndarray
            (was a bare `raise Exception`; now consistent with
            load_num_videos).
    """
    if isinstance(data_path, str):
        videos = load_num_videos(data_path, num_videos)
    elif isinstance(data_path, np.ndarray):
        videos = data_path
    else:
        raise TypeError(
            f"data_path must be str or np.ndarray, got {type(data_path)!r}"
        )
    n, t, h, w, c = videos.shape
    # Convert every clip to a [T, C, H, W] float tensor in [0, 1].
    videos_th = []
    for i in range(n):
        video = videos[i, :, :, :, :]
        images = [video[j, :, :, :] for j in range(t)]
        images = [to_tensor(img) for img in images]
        video = torch.stack(images)
        videos_th.append(video)
    # Pad short clips with black frames so all clips share num_frames.
    if verbose:
        videos = [
            fill_with_black_squares(v, num_frames)
            for v in tqdm(videos_th, desc="Adding empty frames")
        ]  # NTCHW
    else:
        videos = [fill_with_black_squares(v, num_frames) for v in videos_th]  # NTCHW
    frame_grids = torch.stack(videos).permute(1, 0, 2, 3, 4)  # [T, N, C, H, W]
    if nrow is None:
        nrow = int(np.ceil(np.sqrt(n)))
    if verbose:
        frame_grids = [
            make_grid(fs, nrow=nrow) for fs in tqdm(frame_grids, desc="Making grids")
        ]
    else:
        frame_grids = [make_grid(fs, nrow=nrow) for fs in frame_grids]
    if os.path.dirname(out_path) != "":
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # [T, 3, H, W] float in [0, 1] -> uint8 [T, H, W, C] for the writer.
    frame_grids = (
        (torch.stack(frame_grids) * 255).to(torch.uint8).permute(0, 2, 3, 1)
    )  # [T, H, W, C]
    torchvision.io.write_video(
        out_path, frame_grids, fps=fps, video_codec="h264", options={"crf": "10"}
    )
import importlib
import numpy as np
import torch
import torch.distributed as dist
def count_params(model, verbose=False):
    """Return the total number of parameters in *model*.

    With verbose=True, also print the count in millions.
    """
    n_params = 0
    for p in model.parameters():
        n_params += p.numel()
    if verbose:
        print(f"{model.__class__.__name__} has {n_params*1.e-6:.2f} M params.")
    return n_params
def check_istarget(name, para_list):
    """Return True if any partial target name occurs in *name*.

    name: full name of source para
    para_list: partial name of target para
    """
    return any(partial in name for partial in para_list)
def instantiate_from_config(config):
    """Build an object from a config dict of the form
    {"target": "pkg.mod.Class", "params": {...}}.

    The sentinel strings "__is_first_stage__" / "__is_unconditional__"
    deliberately map to None.

    Raises:
        KeyError: if config lacks a "target" key and is not a sentinel.
    """
    # Idiom fix: `"target" not in config` instead of `not "target" in config`.
    if "target" not in config:
        if config == "__is_first_stage__":
            return None
        elif config == "__is_unconditional__":
            return None
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path "pkg.mod.Name" to the named attribute.

    With reload=True, re-import the owning module first (useful while
    iterating on code).
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    target_module = importlib.import_module(module_path, package=None)
    return getattr(target_module, attr_name)
def load_npz_from_dir(data_dir):
    """Concatenate the "arr_0" arrays of every file in data_dir along axis 0.

    NOTE(review): relies on `os` being importable at module level in this
    file — confirm the module imports it.
    """
    arrays = []
    for fname in os.listdir(data_dir):
        arrays.append(np.load(os.path.join(data_dir, fname))["arr_0"])
    return np.concatenate(arrays, axis=0)
def load_npz_from_paths(data_paths):
    """Concatenate the "arr_0" arrays of the given .npz files along axis 0,
    in the order the paths are given."""
    chunks = [np.load(path)["arr_0"] for path in data_paths]
    return np.concatenate(chunks, axis=0)
def setup_dist(args):
    """Initialise NCCL distributed training for this process (idempotent).

    Binds this process to the GPU given by args.local_rank and joins the
    process group via the env:// rendezvous (MASTER_ADDR/PORT, RANK,
    WORLD_SIZE environment variables).
    """
    if dist.is_initialized():
        return  # already set up by an earlier call
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group("nccl", init_method="env://")
# Fetch the pretrained NVComposer checkpoint from the Hugging Face Hub into
# ./models; hf_hub_download returns the local file path of the checkpoint.
from huggingface_hub import hf_hub_download
ckpt = hf_hub_download(
    repo_id="TencentARC/NVComposer", filename="NVComposer-V0.1.ckpt", repo_type="model", local_dir="./models"
)
#!/bin/bash
# Launch the NVComposer Gradio demo from its install directory.
cd /root/NVComposer
python app.py
{
"cells": [
{
"cell_type": "markdown",
"id": "e5c5a211-2ccd-4341-af10-ac546484b91f",
"metadata": {
"tags": []
},
"source": [
"## 项目介绍\n",
"- 原项目地址:https://huggingface.co/spaces/TencentARC/NVComposer\n",
"- NVComposer是一个多视图图像增强生成新视图合成的模型。上传单、多张视图可生成类似相机移动拍摄的视频。 \n",
"- 项目在L20显卡,cuda12.2上进行适配\n",
"## 使用说明\n",
"- 启动和重启 Notebook 点上方工具栏中的「重启并运行所有单元格」。出现如下内容就算成功了:\n",
" - `Running on local URL: http://0.0.0.0:7860`\n",
" - `Running on public URL: https://xxxxxxxxxxxxxxx.gradio.live`\n",
"- 通过以下方式开启页面:\n",
" - 在控制台打开「自定义服务」,将访问端口号设置为7860\n",
" - 直接打开显示的公开链接`public URL`\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53a96614-e2d2-4710-a82b-0d5ca9cb9872",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# 启动\n",
"!sh start.sh"
]
},
{
"cell_type": "markdown",
"source": [
"---\n",
"**扫码关注公众号,获取更多资讯**<br>\n",
"<div align=center>\n",
"<img src=\"assets/二维码.jpeg\" width = 20% />\n",
"</div>\n"
],
"metadata": {
"collapsed": false
},
"id": "2f54158c2967bc25"
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "6dc59fbbcf222b6b"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment