Commit d5878167 authored by mashun1

llava-next
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
# export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
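# e.g. "openai/clip-vit-large-patch14-336" -> "openai_clip-vit-large-patch14-336"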
############### Pretrain ################
# Stage 2
PROMPT_VERSION="qwen_1_5"
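# Quick reference for the DPO flags below: the standard DPO objective is
#   -log sigmoid( beta * [ log(pi_theta(y_w|x)/pi_ref(y_w|x)) - log(pi_theta(y_l|x)/pi_ref(y_l|x)) ] )
# where beta controls how strongly the policy is kept close to the reference model
# (higher beta = stronger pull toward the reference). --dpo_alpha and --gamma are
# specific to this repo's train_dpo.py; check that file for their exact roles.
# MID_RUN_NAME is used by --run_name/--output_dir below but is not set in this
# snippet; give it a descriptive value for your run (illustrative default shown):
MID_RUN_NAME=${MID_RUN_NAME:-"LongVA-7B-dpo-beta0.1"}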
#torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \
llava/train/train_dpo.py \
--deepspeed scripts/zero3.json \
--model_name_or_path lmms-lab/LongVA-7B \
--version $PROMPT_VERSION \
--dpo_alpha 1.0 --beta 0.1 --gamma 0 \
--data_path="/data/llava_video/shareVideoGPTV/dpo/sft_dpo_17k.jsonl" \
--image_folder /data/llava_data \
--video_folder /llava_video/shareVideoGPTV/frames/all_frames/ \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--mm_spatial_pool_stride 2 \
--mm_resampler_type "spatial_pool" \
--mm_spatial_pool_out_channels 1024 \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type unires \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir "/checkpoints/${MID_RUN_NAME}" \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 3000 \
--save_total_limit 1 \
--learning_rate 5e-7 \
--weight_decay 0. \
--warmup_ratio 0.1 \
--lr_scheduler_type "linear" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--attn_implementation sdpa
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
# export NCCL_IB_HCA=${ARNOLD_RDMA_DEVICE}
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
# DPO Stage
PROMPT_VERSION="qwen_1_5"
SFT_MODEL="lmms-lab/llava-onevision-qwen2-7b-ov"
EPOCH=1
beta=0.1
DPO_RUN_NAME="llava-onevision-qwen2-7b-ov_dpo-beta${beta}-epoch${EPOCH}"
DPO_CLEAN_NAME="${DPO_RUN_NAME##*/}"
OUTPUT_DIR="<your-output-folder>/${DPO_CLEAN_NAME}"
DATA_PATH="<your-data-path>"
echo $DPO_RUN_NAME
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_dpo.py \
--deepspeed scripts/zero3.json \
--model_name_or_path=${SFT_MODEL} \
--dpo_alpha=1.0 \
--beta=${beta} \
--gamma=0 \
--version $PROMPT_VERSION \
--data_path=$DATA_PATH \
--image_folder "<your-image-folder>" \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--unfreeze_mm_vision_tower True \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $DPO_CLEAN_NAME \
--output_dir $OUTPUT_DIR \
--num_train_epochs $EPOCH \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--learning_rate 5e-7 \
--weight_decay 0. \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb \
--dataloader_drop_last True
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
# For the 7B model we recommend bs=1, accum=2, 16 nodes (128 GPUs), lr=1e-5, warmup=0.03
# For the 72B model we recommend bs=1, accum=1, 32 nodes (256 GPUs), lr=1e-5, warmup=0.03
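# Sanity check of the effective global batch size implied by the recommendation
# above (per-device batch x grad accumulation x total GPUs):
#   7B : 1 x 2 x 128 GPUs = 256
#   72B: 1 x 1 x 256 GPUs = 256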
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
############### Finetune ################
# Stage 2
PROMPT_VERSION="qwen_1_5"
RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_stage_am9"
PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mid_to_final_next_3m_am9_july14" # replace with your last checkpoint trained on the single-image collection
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path $PREV_STAGE_CHECKPOINT \
--version $PROMPT_VERSION \
--data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_ov_stage_july21.yaml \
--image_folder /mnt/bn/vl-research/data/llava_data \
--video_folder /mnt/bn/vl-research/data/llava_video \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $RUN_NAME \
--output_dir /mnt/bn/vl-research/checkpoints/onevision/$RUN_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--frames_upbound 32
exit 0;
# You can delete the sdpa attn_implementation if you want to use flash attn
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
# For the 7B model we recommend bs=1, accum=2, 16 nodes (128 GPUs), lr=1e-5, warmup=0.03
# For the 72B model we recommend bs=1, accum=1, 32 nodes (256 GPUs), lr=1e-5, warmup=0.03
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
############### Finetune ################
# Stage 2
PROMPT_VERSION="qwen_1_5"
RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-si_stage_am9"
PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/xxxxxxxxxxxxxxxx" # replace with your last checkpoint from the mid-training stage
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path $PREV_STAGE_CHECKPOINT \
--version $PROMPT_VERSION \
--data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_3p2m_single_image.yaml \
--image_folder /mnt/bn/vl-research/data/llava_data \
--video_folder /mnt/bn/vl-research/data/llava_video \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $RUN_NAME \
--output_dir /mnt/bn/vl-research/checkpoints/onevision/$RUN_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--frames_upbound 32
exit 0;
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/blip558k_stage1.5_finetune_w_prompt.json # released in lmms-lab/LLaVA-ReCap-*
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/coco118k_stage1.5_finetune_w_prompt.json # released in lmms-lab/LLaVA-ReCap-*
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/cc3m_recap_data_prompt_v2.json # released in lmms-lab/LLaVA-ReCap-*
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_tr_sft.json # released in lmms-lab/LLaVA-OneVision-Mid-Data
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/instruct_azure_dc_zh_92K.json # not released, explained at https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main/scripts/train
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/Evol-Instruct-GPT4-Turbo-143K.json # released in lmms-lab/LLaVA-OneVision-Mid-Data
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_zh/synthdog_zh_100k.json # released in lmms-lab/LLaVA-OneVision-Mid-Data
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_en/synthdog_en_100k.json # released in lmms-lab/LLaVA-OneVision-Mid-Data
sampling_strategy: all
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_next_fit_mix_filtered_text_wild_738590.json
sampling_strategy: "first:50%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_39k.json # not released
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_12k.json # not released
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_metagen_87358.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_rule_geo_100000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/cambrian_filtered_gpt4vo_sp_token_fltd_max10k_checked.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/VisualWebInstruct_filtered_263589.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/visual_chat_en_26048_gpt4o_coco_checked.json # not released
sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/gpt4o_combinations_51316.json
# sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/chrome_writting_train_8835.json
# sampling_strategy: "first:20%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/k12_printing_train_256646.json
# sampling_strategy: "first:1%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/iiit5k_annotations_2000.json
# sampling_strategy: "first:20%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/hme100k_train_clean_74502.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sroie_data_33626.json
# sampling_strategy: "first:1%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_a_train_2009.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_b_train_3000.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llavar_gpt4_20k.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_azuregpt_detailed_understanding_4874.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_vqa_4404.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_azuregpt4v_1992.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_chart_1787.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_normal_gpt4v_filtered_10500.json
sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/scienceqa_nona_context_19218.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_vflan4v_20000.json
sampling_strategy: "first:30%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_laion4v_50000.json
sampling_strategy: "first:30%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textocr_gpt4v_train_converted_25114.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_train_internvl_single_12413.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textcaps_train_21952.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_qa_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_cap_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_ie_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_kg_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/vision_flan_filtered_186070.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo3k_2101.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4o_dataset.jsonl
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-coco-50k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-knowledge-2k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-llava-30k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-sam-20k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_CLEVR-Math_5290.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_FigureQA_17597.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Geometry3K_9734.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GeoQA+_17172.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GEOS_508.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_IconQA_22599.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_MapQA_5235.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PlotQA_5485.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PMC-VQA_35958.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Super-CLEVR_8652.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TabMWP_22462.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TQA_10181.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_UniGeo_11959.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VizWiz_6614.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-AS_5907.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-RAD_2130.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_qwen2_72b_st_300000_sp_token_fltd_299992.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_st_300000.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_mt_300000_sp_token_fltd_299998.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/image_textualization_dataset_filtered.json
sampling_strategy: "first:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/ai2d_llava_format_2434.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chart2text_26961.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chartqa_18265_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/diagram_image_to_text_300.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hateful_memes_8500_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hitab_2500_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/iam_5663.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/infographic_vqa_2118_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/intergps_1280_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/mapqa_37417_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/rendered_text_10000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_sqa_8514.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wikisql_74989.json
sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wtq_38246_llava_format_filtered_4000tokens_38236.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/screen2words_15730.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tabmwp_22722.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tallyqa_98680_llava_format.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/st_vqa_17247_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tqa_llava_format_27307.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visual7w_llava_format_14366.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visualmrc_3027.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vqarad_313_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vsr_2157_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vistext_9969.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/websight_10000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_DEMON-FULL_filtered_311085.json # released in lmms-lab/M4-Instruct
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_mantis-instruct_reformatted.json # released in lmms-lab/M4-Instruct
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/academic_source_30s_v1_all.json # will be released in next version of LLaVA-NeXT-Video
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/0718_0_30_s_academic_mc_v0_1_all.json # will be released in next version of LLaVA-NeXT-Video
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4video_255000.json # download from sharegpt4video
sampling_strategy: all
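# Note on the sampling_strategy values used throughout these dataset yamls
# (semantics as understood from this config; the authoritative parser is in the
# training code's data loading):
#   "all"      - use every sample in the json
#   "first:N%" - use only the first N percent of samples
#   "end:N%"   - use only the last N percent of samples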
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION=plain
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${LLM_VERSION} \
--version ${PROMPT_VERSION} \
--data_path /blip_558k/blip_558k_plain.json \
--image_folder /blip_558k/images \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_tunable_parts="mm_mlp_adapter" \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir /checkpoints/projectors/${BASE_RUN_NAME} \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "no" \
--save_steps 50000 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 8192 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--run_name $BASE_RUN_NAME \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION=plain
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${LLM_VERSION} \
--version ${PROMPT_VERSION} \
--data_path /blip_558k/blip_558k_plain.json \
--image_folder /blip_558k/images \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_tunable_parts="mm_mlp_adapter" \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir /checkpoints/projectors/${BASE_RUN_NAME} \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "no" \
--save_steps 50000 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 8192 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--run_name $BASE_RUN_NAME \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_next_fit_mix_filtered_text_wild_738590.json # released in lmms-lab/LLaVA-NeXT-Data
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_39k.json # not released
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_12k.json # not released
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llavar_gpt4_20k.json
sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sroie_data_33626.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_a_train_2009.json
# sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_b_train_3000.json
# sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_metagen_87358.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_rule_geo_100000.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/chrome_writting_train_8835.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/k12_printing_train_256646.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/iiit5k_annotations_2000.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/hme100k_train_clean_74502.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_azuregpt_detailed_understanding_4874.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_vqa_4404.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_azuregpt4v_1992.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_chart_1787.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_normal_gpt4v_filtered_10500.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/scienceqa_nona_context_19218.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_vflan4v_20000.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_laion4v_50000.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textocr_gpt4v_train_converted_25114.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_train_internvl_single_12413.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textcaps_train_21952.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_qa_sft.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_cap_sft.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_ie_sft.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_kg_sft.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/vision_flan_filtered_186070.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo3k_2101.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-coco-50k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-knowledge-2k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-llava-30k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-sam-20k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_CLEVR-Math_5290.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_FigureQA_17597.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Geometry3K_9734.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GeoQA+_17172.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GEOS_508.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_IconQA_22599.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_MapQA_5235.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PMC-VQA_35958.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Super-CLEVR_8652.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TabMWP_22462.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TQA_10181.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_UniGeo_11959.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VizWiz_6614.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-AS_5907.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-RAD_2130.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/Evol-Instruct-GPT4-Turbo-143000.json
sampling_strategy: "first:30%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_qwen2_72b_st_300000_sp_token_fltd_299992.json
sampling_strategy: "first:50%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_st_300000.json
sampling_strategy: "first:50%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_mt_300000_sp_token_fltd_299998.json
sampling_strategy: "first:50%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/image_textualization_dataset_filtered.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/cambrian_filtered_gpt4vo_sp_token_fltd_max10k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4o_dataset.jsonl
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/ai2d_llava_format_2434.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/aokvqa_16539_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chart2text_26961.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chartqa_18265_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/clevr_70000_llava_format.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/diagram_image_to_text_300.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/dvqa_200000_llava_format.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/figureqa_100000_llava_format.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/geomverse_9303.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hateful_memes_8500_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hitab_2500_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/iam_5663.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/raven_42000.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/iconqa_llava_format_27307.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/infographic_vqa_2118_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/intergps_1280_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/mapqa_37417_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/multihiertt_7619.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/rendered_text_10000.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_sqa_8514.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wikisql_74989.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wtq_38246_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/screen2words_15730.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/scienceqa_llava_format_4976.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tabmwp_22722.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tallyqa_98680_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/st_vqa_17247_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tqa_llava_format_27307.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visual7w_llava_format_14366.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visualmrc_3027.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vqarad_313_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vsr_2157_llava_format.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vistext_9969.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/websight_10000.json
sampling_strategy: "all"
#!/bin/bash
ROOT_DIR="/home/LLaVA-NeXT"
if [ ! -e $ROOT_DIR ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
CKPT=$1
CONV_MODE=$2
FRAMES=$3
POOL_STRIDE=$4
POOL_MODE=$5
NEWLINE_POSITION=$6
OVERWRITE=$7
VIDEO_PATH=$8
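# Example invocation (illustrative values only; adjust checkpoint, conv mode, and
# video path to your setup):
#   bash this_script.sh lmms-lab/LLaVA-NeXT-Video-7B-DPO vicuna_v1 32 2 average grid True /path/to/video.mp4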
if [ "$OVERWRITE" = False ]; then
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE}
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
fi
python3 playground/demo/video_demo.py \
--model-path $CKPT \
--video_path ${VIDEO_PATH} \
--output_dir ./work_dirs/video_demo/$SAVE_DIR \
--output_name pred \
--chunk-idx $(($IDX - 1)) \
--overwrite ${OVERWRITE} \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--for_get_frames_num $FRAMES \
--conv-mode $CONV_MODE \
--mm_spatial_pool_mode ${POOL_MODE:-average} \
--mm_newline_position ${NEWLINE_POSITION:-grid} \
--prompt "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes."
#!/bin/bash
ROOT_DIR="root to LLaVA-NeXT-Video"
if [ ! -e "$ROOT_DIR" ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
gpu_list="${CUDA_VISIBLE_DEVICES}"
# Split the comma-separated GPU list into a bash array
IFS=',' read -r -a GPULIST <<< "$gpu_list"
CHUNKS=${#GPULIST[@]}
echo "Using $CHUNKS GPUs"
CKPT=$1
CONV_MODE=$2
FRAMES=$3
POOL_STRIDE=$4
OVERWRITE=$5
CHUNKS=${6:-1}
# The variables below are only referenced by the commented-out ActivityNet-QA
# launcher further down; they fall back to sensible defaults if not exported.
PREDEFINED_CONFIGURE=${PREDEFINED_CONFIGURE:-False}
mm_spatial_pool_stride=${mm_spatial_pool_stride:-$POOL_STRIDE}
MODEL_MAX_LENGTH=${MODEL_MAX_LENGTH:-0}
PATCHIFY=False
OPENAIKEY="INPUT YOUR OPENAI API KEY"
if [ "$OVERWRITE" = False ]; then
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE}
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
fi
echo $SAVE_DIR
# for IDX in {1..$CHUNKS}; do
# GPU_ID=${GPULIST[$IDX]} # Note: Zsh arrays are 1-indexed by default
# # GPU_FREE=0
# # while [ $GPU_FREE -eq 0 ]; do
# # # Using nvidia-smi to get the memory usage of the GPU with ID $GPU_ID
# # # Parsing the output to extract the memory usage, and checking if it is "0"
# # MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # if [ "$MEM_USAGE" -eq 0 ]; then
# # GPU_FREE=1
# # echo "GPU $GPU_ID is free."
# # else
# # echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB. Checking again in 100 seconds..."
# # sleep 100
# # fi
# # done
# echo "Running on GPU $GPU_ID"
# CUDA_VISIBLE_DEVICES=$GPU_ID python3 llavavid/eval/model_activitynet_qa.py \
# --model-path $CKPT \
# --video_dir ./data/llava_video/ActivityNet-QA/all_test \
# --gt_file_question ./data/llava_video/ActivityNet-QA/test_q.json \
# --gt_file_answers ./data/llava_video/ActivityNet-QA/test_a.json \
# --output_dir ./work_dirs/eval_activitynet/$SAVE_DIR \
# --output_name pred \
# --num-chunks $CHUNKS \
# --chunk-idx $(($IDX - 1)) \
# --overwrite ${OVERWRITE} \
# --patchify_video_feature ${PATCHIFY} \
# --predefined_configure ${PREDEFINED_CONFIGURE} \
# --mm_spatial_pool_stride ${mm_spatial_pool_stride:-4} \
# --for_get_frames_num $FRAMES \
# --model-max-length ${MODEL_MAX_LENGTH:-0} \
# --conv-mode $CONV_MODE &
# done
# wait
python3 llava/eval/eval_activitynet_qa.py \
--pred_path ./work_dirs/eval_activitynet/$SAVE_DIR \
--output_dir ./work_dirs/eval_activitynet/$SAVE_DIR/results \
--output_json ./work_dirs/eval_activitynet/$SAVE_DIR/results.json \
--num_chunks $CHUNKS \
--api_key $OPENAIKEY \
# --num_tasks 16 \
#!/bin/bash
ROOT_DIR="root to LLaVA-NeXT-Video"
if [ ! -e "$ROOT_DIR" ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
# CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
gpu_list="${CUDA_VISIBLE_DEVICES}"
# Split the comma-separated GPU list into a bash array
IFS=',' read -r -a GPULIST <<< "$gpu_list"
# CHUNKS=${#GPULIST[@]}
# echo "Using $CHUNKS GPUs"
CKPT=$1
CONV_MODE=$2
FRAMES=$3
POOL_STRIDE=$4
OVERWRITE=$5
CHUNKS=${6:-1}
OPENAIKEY="INPUT YOUR OPENAI API KEY"
if [ "$OVERWRITE" = False ]; then
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE}
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
fi
echo $SAVE_DIR
# Assuming GPULIST is a bash array containing your GPUs
GPULIST=(0 1 2 3 4 5 6 7)
# Get the number of GPUs
NUM_GPUS=${#GPULIST[@]}
# Calculate GPUs per chunk
GPUS_PER_CHUNK=$((NUM_GPUS / CHUNKS))
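# Worked example of the chunking above (illustrative): with GPULIST=(0..7) and
# CHUNKS=2, GPUS_PER_CHUNK=4, so chunk 1 runs on GPUs 0-3 and chunk 2 on GPUs 4-7;
# each chunk launches one inference process with CUDA_VISIBLE_DEVICES set to its slice.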
for IDX in $(seq 1 $CHUNKS); do
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
# ALL_GPUS_FREE=0
# while [ $ALL_GPUS_FREE -eq 0 ]; do
# ALL_GPUS_FREE=1 # Assume all GPUs are free initially
# for GPU_ID in $CHUNK_GPUS; do
# MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # Assuming a GPU is considered free if its memory usage is less than 100 MiB
# if [ "$MEM_USAGE" -ge 100 ]; then
# ALL_GPUS_FREE=0
# echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB."
# break # Exit the loop early as we found a GPU that is not free
# fi
# done
# if [ $ALL_GPUS_FREE -eq 0 ]; then
# echo "Not all GPUs in chunk are free. Checking again in 100 seconds..."
# sleep 100
# fi
# done
echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR"
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_chatgpt_general.py \
--model-path $CKPT \
--video_dir ./data/llava_video/video-chatgpt/evaluation/Test_Videos/ \
--gt_file ./data/llava_video/video-chatgpt/evaluation/generic_qa.json \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_name pred \
--num-chunks $CHUNKS \
--chunk-idx $(($IDX - 1)) \
--overwrite ${OVERWRITE:-true} \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--for_get_frames_num $FRAMES \
--conv-mode $CONV_MODE &
done
wait
python3 llava/eval/evaluate_benchmark_1_correctness.py \
--pred_path ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR/correctness_results \
--output_json ./work_dirs/eval_video_chatgpt/$SAVE_DIR/correctness_results.json \
--num_chunks $CHUNKS \
--output_name pred \
--num_tasks 16 \
--api_key $OPENAIKEY
python3 llava/eval/evaluate_benchmark_2_detailed_orientation.py \
--pred_path ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR/detail_results \
--output_json ./work_dirs/eval_video_chatgpt/$SAVE_DIR/detail_results.json \
--num_chunks $CHUNKS \
--output_name pred \
--num_tasks 16 \
--api_key $OPENAIKEY
python3 llava/eval/evaluate_benchmark_3_context.py \
--pred_path ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR/context_results \
--output_json ./work_dirs/eval_video_chatgpt/$SAVE_DIR/context_results.json \
--num_chunks $CHUNKS \
--output_name pred \
--num_tasks 16 \
--api_key $OPENAIKEY
for IDX in $(seq 1 $CHUNKS); do
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
# ALL_GPUS_FREE=0
# while [ $ALL_GPUS_FREE -eq 0 ]; do
# ALL_GPUS_FREE=1 # Assume all GPUs are free initially
# for GPU_ID in $CHUNK_GPUS; do
# MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # Assuming a GPU is considered free if its memory usage is less than 100 MiB
# if [ "$MEM_USAGE" -ge 100 ]; then
# ALL_GPUS_FREE=0
# echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB."
# break # Exit the loop early as we found a GPU that is not free
# fi
# done
# if [ $ALL_GPUS_FREE -eq 0 ]; then
# echo "Not all GPUs in chunk are free. Checking again in 100 seconds..."
# sleep 100
# fi
# done
echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR"
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_chatgpt_general.py \
--model-path $CKPT \
--video_dir ./data/llava_video/video-chatgpt/evaluation/Test_Videos/ \
--gt_file ./data/llava_video/video-chatgpt/evaluation/temporal_qa.json \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_name pred_temporal \
--num-chunks $CHUNKS \
--chunk-idx $(($IDX - 1)) \
--for_get_frames_num $FRAMES \
--overwrite ${OVERWRITE} \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--conv-mode $CONV_MODE &
done
wait
python3 llava/eval/evaluate_benchmark_4_temporal.py \
--pred_path ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR/temporal_results \
--output_json ./work_dirs/eval_video_chatgpt/$SAVE_DIR/temporal_results.json \
--num_chunks $CHUNKS \
--output_name pred_temporal \
--num_tasks 16 \
--api_key $OPENAIKEY
for IDX in $(seq 1 $CHUNKS); do
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
# ALL_GPUS_FREE=0
# while [ $ALL_GPUS_FREE -eq 0 ]; do
# ALL_GPUS_FREE=1 # Assume all GPUs are free initially
# for GPU_ID in $CHUNK_GPUS; do
# MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # Assuming a GPU is considered free if its memory usage is less than 100 MiB
# if [ "$MEM_USAGE" -ge 100 ]; then
# ALL_GPUS_FREE=0
# echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB."
# break # Exit the loop early as we found a GPU that is not free
# fi
# done
# if [ $ALL_GPUS_FREE -eq 0 ]; then
# echo "Not all GPUs in chunk are free. Checking again in 100 seconds..."
# sleep 100
# fi
# done
echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR"
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_chatgpt_consistency.py \
--model-path $CKPT \
--video_dir ./data/llava_video/video-chatgpt/evaluation/Test_Videos/ \
--gt_file ./data/llava_video/video-chatgpt/evaluation/consistency_qa.json \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_name pred_consistency \
--num-chunks $CHUNKS \
--chunk-idx $(($IDX - 1)) \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--for_get_frames_num $FRAMES \
--overwrite ${OVERWRITE} \
--conv-mode $CONV_MODE &
done
wait
python3 llava/eval/evaluate_benchmark_5_consistency.py \
--pred_path ./work_dirs/eval_video_chatgpt/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_chatgpt/$SAVE_DIR/consistency_results \
--output_json ./work_dirs/eval_video_chatgpt/$SAVE_DIR/consistency_results.json \
--num_chunks $CHUNKS \
--output_name pred_consistency \
--num_tasks 16 \
--api_key $OPENAIKEY \
#!/bin/bash
ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/llava-next-video"
if [ ! -e $ROOT_DIR ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
CKPT=$1
CONV_MODE=$2
FRAMES=$3
POOL_STRIDE=$4
OVERWRITE=$5
CHUNKS=${6:-1}
DO_CENTER_CROP=${7:-False}
echo "Using $CHUNKS GPUs"
LOAD_8BIT=False
if [ "$OVERWRITE" = False ]; then
if [ "$MODEL_MAX_LENGTH" = 0 ]; then
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_overwrite_${OVERWRITE}
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_overwrite_${OVERWRITE}
fi
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
fi
SAVE_DIR=${SAVE_DIR}_do_center_crop_${DO_CENTER_CROP}
# Assuming GPULIST is a bash array containing your GPUs
GPULIST=(0 1 2 3 4 5 6 7)
# GPULIST=(0)
# Get the number of GPUs
NUM_GPUS=${#GPULIST[@]}
# Calculate GPUs per chunk
GPUS_PER_CHUNK=$((NUM_GPUS / CHUNKS))
for IDX in $(seq 1 $CHUNKS); do
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
# ALL_GPUS_FREE=0
# while [ $ALL_GPUS_FREE -eq 0 ]; do
# ALL_GPUS_FREE=1 # Assume all GPUs are free initially
# for GPU_ID in $CHUNK_GPUS; do
# MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # Assuming a GPU is considered free if its memory usage is less than 100 MiB
# if [ "$MEM_USAGE" -ge 100 ]; then
# ALL_GPUS_FREE=0
# echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB."
# break # Exit the loop early as we found a GPU that is not free
# fi
# done
# if [ $ALL_GPUS_FREE -eq 0 ]; then
# echo "Not all GPUs in chunk are free. Checking again in 100 seconds..."
# sleep 100
# fi
# done
echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR"
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_description_from_t2v.py \
--model-path $CKPT \
--gt_file /mnt/bn/vl-research-1t/tuyen/webvid_hdvg_movie_pond5_for_captioning_evaluation/webvid_hdvg_movie_pond5_for_captioning_evaluation.processed.csv \
--output_dir ./work_dirs/eval_video_description_from_t2v/$SAVE_DIR \
--output_name pred \
--num-chunks $CHUNKS \
--chunk-idx $(($IDX - 1)) \
--overwrite ${OVERWRITE} \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--for_get_frames_num $FRAMES \
--load_8bit $LOAD_8BIT \
--do_center_crop $DO_CENTER_CROP \
--conv-mode $CONV_MODE &
done
wait
cat ${ROOT_DIR}/work_dirs/eval_video_description_from_t2v/$SAVE_DIR/${CHUNKS}* > ${ROOT_DIR}/work_dirs/eval_video_description_from_t2v/$SAVE_DIR/pred.json
#!/bin/bash
ROOT_DIR="root to LLaVA-NeXT-Video"
if [ ! -e "$ROOT_DIR" ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
OPENAIKEY="INPUT YOUR OPENAI API KEY"
SAVE_DIR=$1
python3 llava/eval/evaluate_benchmark_video_detail_description.py \
--pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR/pred.json \
--output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \
--output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \
--num_chunks 1 \
--num_tasks 16 \
--api_key $OPENAIKEY \
#!/bin/bash
ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/llava-next-video"
if [ ! -e $ROOT_DIR ]; then
echo "The root dir does not exist. Exiting the script."
exit 1
fi
cd $ROOT_DIR
export PYTHONWARNINGS=ignore
export TOKENIZERS_PARALLELISM=false
OPENAIKEY="INPUT YOUR OPENAI API KEY"
CKPT=$1
CONV_MODE=$2
FRAMES=$3
POOL_STRIDE=$4
OVERWRITE=$5
CHUNKS=${6:-1}
echo "Using $CHUNKS GPUs"
if [ "$OVERWRITE" = False ]; then
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE}
else
SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
fi
# Assuming GPULIST is a bash array containing your GPUs
GPULIST=(0 1 2 3 4 5 6 7)
# Get the number of GPUs
NUM_GPUS=${#GPULIST[@]}
# Calculate GPUs per chunk
GPUS_PER_CHUNK=$((NUM_GPUS / CHUNKS))
for IDX in $(seq 1 $CHUNKS); do
START=$(((IDX-1) * GPUS_PER_CHUNK))
LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
# Convert the chunk GPUs array to a comma-separated string
CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
# ALL_GPUS_FREE=0
# while [ $ALL_GPUS_FREE -eq 0 ]; do
# ALL_GPUS_FREE=1 # Assume all GPUs are free initially
# for GPU_ID in $CHUNK_GPUS; do
# MEM_USAGE=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i $GPU_ID | tr -d '[:space:]')
# # Assuming a GPU is considered free if its memory usage is less than 100 MiB
# if [ "$MEM_USAGE" -ge 100 ]; then
# ALL_GPUS_FREE=0
# echo "GPU $GPU_ID is in use. Memory used: ${MEM_USAGE}MiB."
# break # Exit the loop early as we found a GPU that is not free
# fi
# done
# if [ $ALL_GPUS_FREE -eq 0 ]; then
# echo "Not all GPUs in chunk are free. Checking again in 100 seconds..."
# sleep 100
# fi
# done
echo "CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR"
CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 llava/eval/model_video_detail_description.py \
--model-path $CKPT \
--video_dir ./data/llava_video/video-chatgpt/evaluation/Test_Videos/ \
--output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR \
--output_name pred \
--num-chunks $CHUNKS \
--chunk-idx $(($IDX - 1)) \
--overwrite ${OVERWRITE} \
--mm_spatial_pool_stride ${POOL_STRIDE:-4} \
--for_get_frames_num $FRAMES \
--conv-mode $CONV_MODE &
done
wait
python3 llava/eval/evaluate_benchmark_video_detail_description.py \
--pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR \
--output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \
--output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \
--num_chunks $CHUNKS \
--num_tasks 16 \
--api_key $OPENAIKEY \
#!/bin/bash
# Set up the data folder
IMAGE_FOLDER="XXX"
VIDEO_FOLDER="XXX"
DATA_YAML="XXX" # e.g. exp.yaml
############### Prepare Envs #################
python3 -m pip install flash-attn --no-build-isolation
alias python=python3
############### Show Envs ####################
nvidia-smi
################ Arnold Jobs ################
LLM_VERSION="Qwen/Qwen2-72B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-72B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
# Stage 2
PROMPT_VERSION="qwen_1_5"
MID_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video_am9"
PREV_STAGE_CHECKPOINT="lmms-lab/llava-onevision-qwen2-72b-ov-si"
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${MID_RUN_NAME}"
# ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \
deepspeed --master_port 30000 \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path $PREV_STAGE_CHECKPOINT \
--version $PROMPT_VERSION \
--data_path $DATA_YAML \
--image_folder $IMAGE_FOLDER \
--video_folder $VIDEO_FOLDER \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir ./work_dirs/$MID_RUN_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 2 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--frames_upbound 32 \
--mm_newline_position grid \
--add_time_instruction True \
--force_sample True \
--mm_spatial_pool_stride 2
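# Effective global batch size = per_device_train_batch_size (1) x gradient_accumulation_steps (2) x number of GPUs.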
exit 0;
\ No newline at end of file
#!/bin/bash
# Set up the data folder
IMAGE_FOLDER="XXX"
VIDEO_FOLDER="XXX"
DATA_YAML="XXX" # e.g., exp.yaml
############### Prepare Envs #################
python3 -m pip install flash-attn --no-build-isolation
alias python=python3
############### Show Envs ####################
nvidia-smi
################ Arnold Jobs ################
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
#
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
# Stage 2
PROMPT_VERSION="qwen_1_5"
MID_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video_am9"
PREV_STAGE_CHECKPOINT="lmms-lab/llava-onevision-qwen2-7b-ov-si"
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${MID_RUN_NAME}"
# ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" --nnodes="${ARNOLD_WORKER_NUM}" --node_rank="${ARNOLD_ID}" --master_addr="${METIS_WORKER_0_HOST}" --master_port="${port_in_cmd}" \
deepspeed --master_port 30000 \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path $PREV_STAGE_CHECKPOINT \
--version $PROMPT_VERSION \
--data_path $DATA_YAML \
--image_folder $IMAGE_FOLDER \
--video_folder $VIDEO_FOLDER \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir ./work_dirs/$MID_RUN_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 2 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--frames_upbound 64 \
--mm_newline_position grid \
--add_time_instruction True \
--force_sample True \
--mm_spatial_pool_stride 2
exit 0;
\ No newline at end of file
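# Dataset mixture yaml consumed via --data_path. The sampling_strategy field controls how much
# of each JSON is used: "all" keeps every sample; "first:N%" and "end:N%" (as the names suggest)
# take the leading or trailing N percent of the file.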
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_next_fit_mix_filtered_text_wild_738590.json
sampling_strategy: "first:50%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_39k.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_wild_4v_12k.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_metagen_87358.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_rule_geo_100000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/cambrian_filtered_gpt4vo_sp_token_fltd_max10k_checked.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/VisualWebInstruct_filtered_263589.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/visual_chat_en_26048_gpt4o_coco_checked.json
sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/gpt4o_combinations_51316.json
# sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/chrome_writting_train_8835.json
# sampling_strategy: "first:20%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/k12_printing_train_256646.json
# sampling_strategy: "first:1%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/iiit5k_annotations_2000.json
# sampling_strategy: "first:20%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/hme100k_train_clean_74502.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sroie_data_33626.json
# sampling_strategy: "first:1%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_a_train_2009.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/orand_car_b_train_3000.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llavar_gpt4_20k.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_azuregpt_detailed_understanding_4874.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_vqa_4404.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_azuregpt4v_1992.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_chart_1787.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_normal_gpt4v_filtered_10500.json
sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/scienceqa_nona_context_19218.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_vflan4v_20000.json
sampling_strategy: "first:30%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_laion4v_50000.json
sampling_strategy: "first:30%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textocr_gpt4v_train_converted_25114.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_train_internvl_single_12413.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textcaps_train_21952.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_qa_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_cap_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_ie_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_kg_sft.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/vision_flan_filtered_186070.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo3k_2101.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4o_dataset.jsonl
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-coco-50k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-knowledge-2k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-llava-30k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-sam-20k.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_CLEVR-Math_5290.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_FigureQA_17597.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Geometry3K_9734.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GeoQA+_17172.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GEOS_508.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_IconQA_22599.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_MapQA_5235.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PlotQA_5485.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PMC-VQA_35958.json
sampling_strategy: "first:1%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Super-CLEVR_8652.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TabMWP_22462.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TQA_10181.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_UniGeo_11959.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VizWiz_6614.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-AS_5907.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VQA-RAD_2130.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_qwen2_72b_st_300000_sp_token_fltd_299992.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_st_300000.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_mt_300000_sp_token_fltd_299998.json
sampling_strategy: "end:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/image_textualization_dataset_filtered.json
sampling_strategy: "first:20%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/ai2d_llava_format_2434.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chart2text_26961.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/chartqa_18265_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/diagram_image_to_text_300.json
sampling_strategy: "all"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hateful_memes_8500_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/hitab_2500_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/iam_5663.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/infographic_vqa_2118_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/intergps_1280_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/mapqa_37417_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/rendered_text_10000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_sqa_8514.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wikisql_74989.json
sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wtq_38246_llava_format_filtered_4000tokens_38236.json
# sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/screen2words_15730.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tabmwp_22722.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tallyqa_98680_llava_format.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/st_vqa_17247_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/tqa_llava_format_27307.json
sampling_strategy: "first:5%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visual7w_llava_format_14366.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/visualmrc_3027.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vqarad_313_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vsr_2157_llava_format.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/vistext_9969.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/cauldron/websight_10000.json
sampling_strategy: "first:10%"
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_DEMON-FULL_filtered_311085.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_mantis-instruct_reformatted.json
sampling_strategy: all
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/academic_source_30s_v1_all.json
# sampling_strategy: all
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/0718_0_30_s_academic_mc_v0_1_all.json
# sampling_strategy: all
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4video_255000.json
# sampling_strategy: all
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_academic_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_youtube_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_academic_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_youtube_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_academic_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_youtube_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_academic_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_academic_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_youtube_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_youtube_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_activitynetqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_nextqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_nextqa_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/0_30_s_perceptiontest_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_academic_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_academic_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_youtube_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_youtube_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_activitynetqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_nextqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_nextqa_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/30_60_s_perceptiontest_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_academic_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_academic_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_youtube_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_youtube_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_activitynetqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_nextqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/1_2_m_nextqa_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/sharegptvideo_qa_255k.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_academic_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_youtube_v0_1_cap.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_academic_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_academic_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_youtube_oe_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_youtube_mc_v0_1_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_nextqa_oe_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_nextqa_mc_qa.json
sampling_strategy: "all"
- json_path: /mnt/bn/tiktok-mm-3/aiic/users/wujinming/_training_data/jsons/tos/2_3_m_activitynetqa_oe_qa.json
sampling_strategy: "all"
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
\ No newline at end of file
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
\ No newline at end of file