Commit 30af93f2 authored by chenpangpang

feat: initial GPU commit

parent 68e98ab8
Pipeline #2159 canceled with stages
.idea
chenyh
FROM image.sourcefind.cn:5000/gpu/admin/base/jupyterlab-pytorch:2.3.1-py3.10-cuda11.8-ubuntu22.04-devel as base
ARG IMAGE=nvcomposer
ARG IMAGE_UPPER=NVComposer
ARG BRANCH=gpu
RUN cd /root && git clone -b $BRANCH http://developer.hpccube.com/codes/chenpangpang/$IMAGE.git
WORKDIR /root/$IMAGE/$IMAGE_UPPER
RUN pip install -r requirements.txt
#########
# Prod #
#########
FROM image.sourcefind.cn:5000/gpu/admin/base/jupyterlab-pytorch:2.3.1-py3.10-cuda11.8-ubuntu22.04-devel
ARG IMAGE=nvcomposer
ARG IMAGE_UPPER=NVComposer
COPY chenyh/$IMAGE/frpc_linux_amd64_* /opt/conda/lib/python3.10/site-packages/gradio/
RUN chmod +x /opt/conda/lib/python3.10/site-packages/gradio/frpc_linux_amd64_*
COPY chenyh/nvcomposer/NVComposer-V0.1.ckpt /root/$IMAGE_UPPER/NVComposer-V0.1.ckpt
COPY --from=base /opt/conda/lib/python3.10/site-packages /opt/conda/lib/python3.10/site-packages
COPY --from=base /root/$IMAGE/$IMAGE_UPPER /root/$IMAGE_UPPER
COPY --from=base /root/$IMAGE/启动器.ipynb /root/$IMAGE/start.sh /root/
COPY --from=base /root/$IMAGE/assets/ /root/assets/
\ No newline at end of file
.idea
__pycache__
.git
*.pyc
.DS_Store
._*
cache
\ No newline at end of file
---
title: NVComposer
emoji: 📸
colorFrom: indigo
colorTo: gray
sdk: gradio
sdk_version: 4.38.1
app_file: app.py
pinned: false
python_version: 3.10
---
\ No newline at end of file
num_frames: &num_frames 16
resolution: &resolution [576, 1024]

model:
  base_learning_rate: 1.0e-5
  scale_lr: false
  target: core.models.diffusion.DualStreamMultiViewDiffusionModel
  params:
    use_task_embedding: false
    ray_as_image: false
    apply_condition_mask_in_training_loss: true
    separate_noise_and_condition: true
    condition_padding_with_anchor: false
    use_ray_decoder_loss_high_frequency_isolation: false
    train_with_multi_view_feature_alignment: true
    use_text_cross_attention_condition: false
    linear_start: 0.00085
    linear_end: 0.012
    num_time_steps_cond: 1
    log_every_t: 200
    time_steps: 1000
    data_key_images: combined_images
    data_key_rays: combined_rays
    data_key_text_condition: caption
    cond_stage_trainable: false
    image_size: [72, 128]
    channels: 10
    monitor: global_step
    scale_by_std: false
    scale_factor: 0.18215
    use_dynamic_rescale: true
    base_scale: 0.3
    use_ema: false
    uncond_prob: 0.05
    uncond_type: 'empty_seq'
    use_camera_pose_query_transformer: false
    random_cond: false
    cond_concat: true
    frame_mask: false
    padding: true
    per_frame_auto_encoding: true
    parameterization: "v"
    rescale_betas_zero_snr: true
    use_noise_offset: false
    scheduler_config:
      target: utils.lr_scheduler.LambdaLRScheduler
      interval: 'step'
      frequency: 100
      params:
        start_step: 0
        final_decay_ratio: 0.1
        decay_steps: 100
    bd_noise: false
    unet_config:
      target: core.modules.networks.unet_modules.UNetModel
      params:
        in_channels: 20
        out_channels: 10
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        dropout: 0.1
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear: true
        use_checkpoint: true
        temporal_conv: true
        temporal_attention: true
        temporal_selfatt_only: true
        use_relative_position: false
        use_causal_attention: false
        temporal_length: *num_frames
        addition_attention: true
        image_cross_attention: true
        image_cross_attention_scale_learnable: true
        default_fs: 3
        fs_condition: false
        use_spatial_temporal_attention: true
        use_addition_ray_output_head: true
        ray_channels: 6
        use_lora_for_rays_in_output_blocks: false
        use_task_embedding: false
        use_ray_decoder: true
        use_ray_decoder_residual: true
        full_spatial_temporal_attention: true
        enhance_multi_view_correspondence: false
        camera_pose_condition: true
        use_feature_alignment: true
    first_stage_config:
      target: core.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_img_config:
      target: core.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
      params:
        freeze: true
    image_proj_model_config:
      target: core.modules.encoders.resampler.Resampler
      params:
        dim: 1024
        depth: 4
        dim_head: 64
        heads: 12
        num_queries: 16
        embedding_dim: 1280
        output_dim: 1024
        ff_mult: 4
        video_length: *num_frames
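As a rough, illustrative sketch (not part of this commit): an LDM-style config in this layout is usually loaded with OmegaConf and turned into a model through the repo's instantiate_from_config helper; the config path and checkpoint handling below are assumptions, not code from this repository.

# Hypothetical loading sketch; the config path is an assumption.
from omegaconf import OmegaConf
from utils.utils import instantiate_from_config

config = OmegaConf.load("configs/nvcomposer.yaml")  # assumed path
model = instantiate_from_config(config.model)       # builds DualStreamMultiViewDiffusionModel from target/params
model.eval()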
import torch.nn as nn

from utils.utils import instantiate_from_config


def disabled_train(self, mode=True):
    """Overwrite model.train with this function to make sure train/eval mode
    does not change anymore."""
    return self
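# Illustrative usage (hypothetical names, not part of this module): pin a frozen
# sub-model in eval mode so later .train() calls on the parent become no-ops:
#   frozen_encoder.eval()
#   frozen_encoder.train = disabled_train
#   for p in frozen_encoder.parameters():
#       p.requires_grad = False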
def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
def nonlinearity(type="silu"):
    if type == "silu":
        return nn.SiLU()
    elif type == "leaky_relu":
        return nn.LeakyReLU()
    raise ValueError(f"unsupported nonlinearity type: {type}")
class GroupNormSpecific(nn.GroupNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def normalization(channels, num_groups=32):
    """
    Make a standard normalization layer.
    :param channels: number of input channels.
    :param num_groups: number of groups.
    :return: an nn.Module for normalization.
    """
    return GroupNormSpecific(num_groups, channels)
class HybridConditioner(nn.Module):
    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        c_concat = self.concat_conditioner(c_concat)
        c_crossattn = self.crossattn_conditioner(c_crossattn)
        return {"c_concat": [c_concat], "c_crossattn": [c_crossattn]}
import math
from inspect import isfunction

import torch
import torch.distributed as dist
import torch.utils.checkpoint
from torch import nn
def gather_data(data, return_np=True):
    """gather data from multiple processes to one list"""
    data_list = [torch.zeros_like(data) for _ in range(dist.get_world_size())]
    dist.all_gather(data_list, data)  # gather not supported with NCCL
    if return_np:
        data_list = [data.cpu().numpy() for data in data_list]
    return data_list
def autocast(f):
    def do_autocast(*args, **kwargs):
        with torch.cuda.amp.autocast(
            enabled=True,
            dtype=torch.get_autocast_gpu_dtype(),
            cache_enabled=torch.is_autocast_cache_enabled(),
        ):
            return f(*args, **kwargs)

    return do_autocast
def extract_into_tensor(a, t, x_shape):
    """Gather per-timestep values from `a` at indices `t` and reshape them so
    they broadcast over a tensor of shape `x_shape`."""
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
def noise_like(shape, device, repeat=False):
    def repeat_noise():
        return torch.randn((1, *shape[1:]), device=device).repeat(
            shape[0], *((1,) * (len(shape) - 1))
        )

    def noise():
        return torch.randn(shape, device=device)

    return repeat_noise() if repeat else noise()


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


def exists(val):
    return val is not None


def identity(*args, **kwargs):
    return nn.Identity()


def uniq(arr):
    return {el: True for el in arr}.keys()
def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))


def ismap(x):
    if not isinstance(x, torch.Tensor):
        return False
    return (len(x.shape) == 4) and (x.shape[1] > 3)


def isimage(x):
    if not isinstance(x, torch.Tensor):
        return False
    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)


def max_neg_value(t):
    return -torch.finfo(t.dtype).max


def shape_to_str(x):
    shape_str = "x".join([str(x) for x in x.shape])
    return shape_str


def init_(tensor):
    dim = tensor.shape[-1]
    std = 1 / math.sqrt(dim)
    tensor.uniform_(-std, std)
    return tensor
# USE_DEEP_SPEED_CHECKPOINTING = False
# if USE_DEEP_SPEED_CHECKPOINTING:
#     import deepspeed
#
#     _gradient_checkpoint_function = deepspeed.checkpointing.checkpoint
# else:
_gradient_checkpoint_function = torch.utils.checkpoint.checkpoint


def gradient_checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        # args = tuple(inputs) + tuple(params)
        # return CheckpointFunction.apply(func, len(inputs), *args)
        if isinstance(inputs, tuple):
            return _gradient_checkpoint_function(func, *inputs, use_reentrant=False)
        else:
            return _gradient_checkpoint_function(func, inputs, use_reentrant=False)
    else:
        return func(*inputs)
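# Illustrative usage (hypothetical block/tensor names): re-run the wrapped
# function during backward instead of caching activations when `flag` is true.
#   def _forward(x, context):
#       return block(x, context)
#   out = gradient_checkpoint(_forward, (x, context), block.parameters(), use_checkpoint)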
class CheckpointFunction(torch.autograd.Function):
    @staticmethod
    @torch.cuda.amp.custom_fwd
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads