Commit 3a0371d5 authored by suily

Initial commit

# Unique model identifier
modelCode=1107
# Model name
modelName=vtimellm_pytorch
# Model description
modelDescription=VTimeLLM is a novel Video LLM designed for fine-grained video moment understanding and reasoning with respect to temporal boundaries.
# Application scenarios
appScenario=inference,training,video understanding,meteorology,transportation,e-commerce,broadcast media,education
# Framework type
frameType=pytorch
# torch
# flash-attn
# torchvision
# deepspeed
decord
easydict
einops
gradio
numpy
pandas>=2.0.3
peft>=0.4.0
Pillow
tqdm
transformers==4.31.0
git+https://github.com/openai/CLIP.git
sentencepiece
protobuf
wandb
ninja
huggingface_hub
#!/bin/bash
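# Stage 1 (feature alignment): tunes only the multimodal projector (--tune_mm_mlp_adapter)
# on the blip_laion_cc_sbu_558k captions with a Vicuna-7B-v1.5 backbone.
# Effective batch size: 16 per device x 4 grad-accum steps x 2 GPUs = 128.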
MODEL_VERSION=vicuna-v1-5-7b
gpu_vis=0,1 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29029
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path ./checkpoints/vicuna-7b-v1.5 \
--version plain \
--data_path ./data/blip_laion_cc_sbu_558k.json \
--feat_folder ./feat/558k_clip_feat \
--tune_mm_mlp_adapter True \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage1_test \
--bf16 True \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--gradient_accumulation_steps 4 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--tf32 True \
--logging_steps 1 \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
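# Stage 1 (feature alignment) for the ChatGLM3-6B backbone, using the Chinese
# blip_laion_cc_sbu_558k captions; only the multimodal projector is tuned.
# Effective batch size: 16 per device x 8 grad-accum steps x 1 GPU = 128.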
MODEL_VERSION=chatglm3-6b
gpu_vis=0 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29570
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version plain \
--data_path ./data/blip_laion_cc_sbu_558k_chinese.json \
--feat_folder /path/to/stage1_feat \
--tune_mm_mlp_adapter True \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage1 \
--bf16 True \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--save_total_limit 1 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
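# Stage 2: LoRA fine-tuning (r=64, alpha=128) of Vicuna-7B-v1.5 on stage2.json,
# loading the stage-1 mm projector and keeping it frozen (--freeze_mm_mlp_adapter).
# Effective batch size: 8 per device x 8 grad-accum steps x 2 GPUs = 128.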
MODEL_VERSION=vicuna-v1-5-7b
gpu_vis=0,1 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29029
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--lora_enable True \
--model_name_or_path ./checkpoints/vicuna-7b-v1.5 \
--version v1 \
--data_path ./data/stage2.json \
--feat_folder ./feat/intern_clip_feat \
--pretrain_mm_mlp_adapter ./checkpoints/vtimellm-$MODEL_VERSION-stage1_test/mm_projector.bin \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage2_test \
--bf16 True \
--num_train_epochs 2 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 1e-4 \
--freeze_mm_mlp_adapter True \
--lora_r 64 \
--lora_alpha 128 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
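# Stage 2: LoRA fine-tuning (r=64, alpha=128) of ChatGLM3-6B on stage2_chinese.json,
# loading the stage-1 mm projector and keeping it frozen.
# Effective batch size: 8 per device x 16 grad-accum steps x 1 GPU = 128.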
MODEL_VERSION=chatglm3-6b
gpu_vis=0 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29570
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train.py \
--deepspeed ./scripts/zero3.json \
--lora_enable True \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version plain \
--data_path ./data/stage2_chinese.json \
--feat_folder /path/to/stage2_feat \
--pretrain_mm_mlp_adapter ./checkpoints/vtimellm-$MODEL_VERSION-stage1/mm_projector.bin \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage2 \
--bf16 True \
--num_train_epochs 2 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 1e-4 \
--freeze_mm_mlp_adapter True \
--lora_r 64 \
--lora_alpha 128 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
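# Stage 3 (--training_stage 3): LoRA instruction tuning of Vicuna-7B-v1.5 on stage3.json,
# reusing the frozen stage-1 projector and the stage-2 checkpoint, with the ZeRO stage-2 config.
# Effective batch size: 8 per device x 8 grad-accum steps x 2 GPUs = 128.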
MODEL_VERSION=vicuna-v1-5-7b
gpu_vis=0,1 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29029
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--lora_enable True \
--training_stage 3 \
--model_name_or_path ./checkpoints/vicuna-7b-v1.5 \
--version v1 \
--data_path ./data/stage3.json \
--feat_folder ./feat/stage3_clip_feat \
--pretrain_mm_mlp_adapter ./checkpoints/vtimellm-$MODEL_VERSION-stage1_test/mm_projector.bin \
--stage2_path ./checkpoints/vtimellm-$MODEL_VERSION-stage2_test \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage3_test \
--bf16 True \
--num_train_epochs 2 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 1 \
--learning_rate 1e-4 \
--freeze_mm_mlp_adapter True \
--lora_r 64 \
--lora_alpha 128 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
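# Stage 3 (--training_stage 3): LoRA instruction tuning of ChatGLM3-6B on stage3_chinese.json,
# reusing the frozen stage-1 projector and the stage-2 checkpoint, with the ZeRO stage-2 config.
# Effective batch size: 8 per device x 16 grad-accum steps x 1 GPU = 128.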
MODEL_VERSION=chatglm3-6b
gpu_vis=0 # per_device_train_batch_size * gradient_accumulation_steps * n_gpus = 128
MASTER_PORT=29570
deepspeed --include localhost:$gpu_vis --master_port $MASTER_PORT vtimellm/train/train.py \
--deepspeed ./scripts/zero2.json \
--lora_enable True \
--training_stage 3 \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version plain \
--data_path ./data/stage3_chinese.json \
--feat_folder /path/to/stage3_feat \
--pretrain_mm_mlp_adapter ./checkpoints/vtimellm-$MODEL_VERSION-stage1/mm_projector.bin \
--stage2_path ./checkpoints/vtimellm-$MODEL_VERSION-stage2 \
--output_dir ./checkpoints/vtimellm-$MODEL_VERSION-stage3 \
--bf16 True \
--num_train_epochs 2 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 1e-4 \
--freeze_mm_mlp_adapter True \
--lora_r 64 \
--lora_alpha 128 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
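
The following DeepSpeed configuration enables ZeRO stage-2 optimization; presumably it is the ./scripts/zero2.json referenced by the two stage-3 scripts above.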
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    }
}