Commit d5878167 (llava-next) by mashun1
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Get the latest version of transformers from PyPI
latest_version=$(yolk -V transformers | cut -d ' ' -f 2)
# Upgrade if the installed version is not the latest
if [ "$installed_version" != "$latest_version" ]; then
    pip install -U transformers
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953"
wandb login $WANDB_API_KEY
export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME
export WANDB_PROJECT=LLaVA_Mixtral
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True
# --report_to wandb
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi
pip install pydantic

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Pin transformers to 4.36.2 if a different version is installed
if [ "$installed_version" != "4.36.2" ]; then
    pip install transformers==4.36.2
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c
export WANDB_PROJECT=llava
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14-336 \
--pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--unfreeze_mm_vision_tower True \
--mm_vision_tower_lr 2e-6 \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft \
--num_train_epochs 9 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "epoch" \
--save_steps 1500 \
--learning_rate 5e-6 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 4096 \
--gradient_checkpointing True \
--dataloader_num_workers 8 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi
pip install pydantic

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Pin transformers to 4.36.2 if a different version is installed
if [ "$installed_version" != "4.36.2" ]; then
    pip install transformers==4.36.2
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c
export WANDB_PROJECT=llava
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--unfreeze--anyres--sft
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14-336 \
--pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--unfreeze--sft \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16
#!/bin/bash
# set up wandb
export WANDB_API_KEY=a651c244635bc6f913ab654af3f0eebaecdc9381
export WANDB_ENTITY=llava-vl
export WANDB_PROJECT=llava-next
export PYTHONWARNINGS="ignore"
cd /mnt/bn/vl-research/workspace/boli01/projects/lmms-eval
pip install -e .
# set up llava dev env
cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME='llava_caps20k_chartqa19k'
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
export WANDB_MODE=online
wandb online
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" deepspeed --master_port 26000 --include localhost:0,1,2,3,4,5,6,7 llava/train/train_mem.py \
--deepspeed ./scripts/zero3_offload.json \
--model_name_or_path mistralai/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/llava_instruct/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/data/llava \
--vision_tower openai/clip-vit-large-patch14-336 \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--unfreeze_mm_vision_tower True \
--mm_vision_tower_lr 2e-6 \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--llava1.6--336px--anyres--sft \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1500 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 4096 \
--gradient_checkpointing True \
--dataloader_num_workers 32 \
--lazy_preprocess True \
--report_to wandb \
--run_name $WANDB_NAME
# The arguments below are for a separate evaluation step (lmms-eval), not part
# of the training command above; they are kept commented out for reference.
# --eval_num_processes 4
# --task_names mme,docvqa_val
# --model_args pretrained=./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
# --limit 8
# --batch_size 1
# --log_samples
# --log_samples_suffix debug
# --output_path ./logs/
#!/bin/bash
cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Get the latest version of transformers from PyPI
latest_version=$(yolk -V transformers | cut -d ' ' -f 2)
# Upgrade if the installed version is not the latest
if [ "$installed_version" != "$latest_version" ]; then
    pip install -U transformers
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME="llava_instruct_150k"
# wandb configure
export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953"
wandb login $WANDB_API_KEY
export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME
export WANDB_PROJECT=LLaVA_Mixtral
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data/coco/train2017 \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
# Uncomment and set the following variables correspondingly to run this script:
################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################
################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--lora_enable True \
--bits 4 \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/llava_instruct_80k.json \
--image_folder /path/to/coco/train2017 \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True \
--dataloader_num_workers 16 \
--report_to wandb
#!/bin/bash
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path lmsys/vicuna-13b-v1.3 \
--version $PROMPT_VERSION \
--data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \
--image_folder /Data/ScienceQA/data/scienceqa/images/train \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \
--num_train_epochs 12 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
import argparse
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
def merge_lora(args):
    model_name = get_model_name_from_path(args.model_path)
    # Loading with model_base set merges the LoRA weights into the base model.
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map="cpu")
    model.save_pretrained(args.save_model_path)
    tokenizer.save_pretrained(args.save_model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True)
    parser.add_argument("--model-base", type=str, required=True)
    parser.add_argument("--save-model-path", type=str, required=True)
    args = parser.parse_args()

    merge_lora(args)
#!/bin/bash
# Uncomment and set the following variables correspondingly to run this script:
# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat
########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path /path/to/pretrain_data.json \
--image_folder /path/to/images \
--vision_tower openai/clip-vit-large-patch14 \
--tune_mm_mlp_adapter True \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--learning_rate 2e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
import json
import os
import argparse
from tqdm import tqdm
import yaml
def check_missing_images(json_path, images_folder):
    data = json.load(open(json_path, "r"))
    missing_data = []
    for i, d in enumerate(tqdm(data)):
        image = d["image"] if "image" in d else ""
        if image != "":
            path = os.path.join(images_folder, image)
            if not os.path.exists(path):
                print(f"Missing image: {path}")
                missing_data.append(d)
    return missing_data


def read_yaml_to_llava_data(yaml_path, images_folder):
    print(f"Reading YAML file: {yaml_path}")
    with open(yaml_path, "r") as f:
        data = yaml.safe_load(f)
    llava_json_paths = data["datasets"]
    for item in llava_json_paths:
        json_path = item["json_path"]
        missing_data = check_missing_images(json_path, images_folder)
        if len(missing_data) > 0:
            print(f"Missing images in {json_path}:")
            for d in missing_data:
                print(d)


def direct_check_llava_data(json_path, images_folder):
    missing_data = check_missing_images(json_path, images_folder)
    if len(missing_data) > 0:
        print(f"Missing images in {json_path}:")
        for d in missing_data:
            print(d)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
    parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
    parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")
    args = parser.parse_args()

    if args.json_path != "":
        direct_check_llava_data(args.json_path, args.images_folder)
    elif args.yaml_path != "":
        read_yaml_to_llava_data(args.yaml_path, args.images_folder)
#!/bin/bash
CHUNKS=8
for IDX in {0..7}; do
    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
        --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
        --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
        --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
        --answers-file ./test_llava-13b-chunk${CHUNKS}_${IDX}.jsonl \
        --num-chunks $CHUNKS \
        --chunk-idx $IDX \
        --conv-mode llava_v1 &
done
#!/bin/bash
CHUNKS=8
output_file="test_llava-13b.jsonl"
# Clear out the output file if it exists.
> "$output_file"
# Loop through the indices and concatenate each file.
for idx in $(seq 0 $((CHUNKS-1))); do
    cat "./test_llava-13b-chunk${CHUNKS}_${idx}.jsonl" >> "$output_file"
done
python llava/eval/eval_science_qa.py \
--base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \
--result-file ./test_llava-13b.jsonl \
--output-file ./test_llava-13b_output.json \
--output-result ./test_llava-13b_result.json
#!/bin/bash
export HIP_VISIBLE_DEVICES=4,5,6,7
# evaluate
./scripts/interleave/eval_interleave_3d.sh /home/LLaVA-NeXT/ckpts/llava-next-interleave-qwen-7b /home/LLaVA-NeXT/data/interleave_bench multi_image_in_domain
#./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_out_domain
#./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_view_in_domain
alias python=python3
CKPT_PATH=$1
NAME=$(echo "$CKPT_PATH" | awk -F'/' '{print $NF}')
echo $NAME
##### set images path
DATA_PATH=$2
EVAL_TYPE=$3
JSON_PATH=$2/$3.json
############################### eval multi-image
RESULT_NAME="logs/${NAME}/${EVAL_TYPE}"
echo $RESULT_NAME
mkdir -p logs/${NAME}
file_path=${RESULT_NAME}/result.jsonl
bash scripts/interleave/eval_multiprocess.sh \
${CKPT_PATH} \
${JSON_PATH} \
${RESULT_NAME} \
${DATA_PATH} \
"" \
4 0
python3 llava/eval/evaluate_interleave.py --result-dir ${RESULT_NAME}
#!/bin/bash
# Check that exactly seven arguments are passed
if [ "$#" -ne 7 ]; then
    echo "Usage: $0 <model_path> <question_path> <base_answer_path> <image_folder> <extra_prompt> <N> <temperature>"
    exit 1
fi
# Assign the command line arguments to variables
model_path=$1
question_path=$2
base_answer_path=$3
image_folder=$4
extra_prompt=$5
N=$6
temperature=$7
# Loop over each chunk/process
for (( chunk_id=0; chunk_id<N; chunk_id++ ))
do
# Define the answer path for each chunk
answer_path="${base_answer_path}/result_${chunk_id}.jsonl"
if [ -f "$answer_path" ]; then
rm "$answer_path"
fi
# Run the Python program in the background
CUDA_VISIBLE_DEVICES="$chunk_id" python3 llava/eval/model_vqa.py --model-path "$model_path" --question-file "$question_path" --answers-file "$answer_path" --num-chunks "$N" --chunk-idx "$chunk_id" --image-folder "$image_folder" --extra-prompt "$extra_prompt" --temperature "$temperature" &
# Uncomment below if you need a slight delay between starting each process
# sleep 0.1
done
# Wait for all background processes to finish
wait
merged_file="${base_answer_path}/result.jsonl"
if [ -f "$merged_file" ]; then
rm "$merged_file"
fi
# Merge all the JSONL files into one
#cat "${base_answer_path}"_*.jsonl > "${base_answer_path}.jsonl"
for ((i=0; i<N; i++)); do
input_file="${base_answer_path}/result_${i}.jsonl"
cat "$input_file" >> "${base_answer_path}/result.jsonl"
done
# remove the unmerged files
for (( chunk_id=0; chunk_id<N; chunk_id++ ))
do
# Define the answer path for each chunk
answer_path="${base_answer_path}/result_${chunk_id}.jsonl"
if [ -f "$answer_path" ]; then
rm "$answer_path"
fi
done
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
device = "cuda" # the device to load the model onto
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat")
prompt = "Give me a short introduction to large language model."
messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
import json
import os
from tqdm import tqdm
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k.json") as f:
llava_v1_5_mix665k = json.load(f) # 665298
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_instruct_150k.json") as f:
llava_instruct_150k = json.load(f) # 157712
# Create sets of "id" fields
mix665k_ids = set()
for item in llava_v1_5_mix665k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    mix665k_ids.add(f'{item["id"]}_{all_conv}')
instruct_150k_ids = set()
for item in llava_instruct_150k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    instruct_150k_ids.add(f'{item["id"]}_{all_conv}')
share_gpt_ids = set()
for item in llava_v1_5_mix665k:
    if "image" not in item:
        all_conv = ""
        for cur_conversation in item["conversations"]:
            all_conv += cur_conversation["value"]
        share_gpt_ids.add(f'{item["id"]}_{all_conv}')  # 40688
# Get "id" fields that are in mix665k but not in instruct_150k and share_gpt
new_ids = mix665k_ids - instruct_150k_ids - share_gpt_ids # 466898
# Get "id" fields that are in mix665k but not in share_gpt
# new_ids = mix665k_ids - share_gpt_ids #624610
# import pdb; pdb.set_trace()
# Filter mix665k data based on new_ids
new_data = []
for item in llava_v1_5_mix665k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    if f'{item["id"]}_{all_conv}' in new_ids:
        new_data.append(item)

# Debugging breakpoint left over from development; keep it disabled for normal runs.
# import pdb
# pdb.set_trace()

with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/mixtral_instruct_135K_of_158K_V1.5.json") as f:
    new_mixtral_instruct = json.load(f)
# mixtral_instruct_50K_of_80K_V1.json@
# print(len(new_data))
# for _ in new_mixtral_instruct:
# # import pdb; pdb.set_trace()
# if "coco" not in _["image"]:
# _["image"] = f"coco/train2017/{_['image']}"
# new_data.append(_)
# print(len(instruct_150k_ids))
print(len(new_data))
# for _ in tqdm(new_data):
# if "image" in _:
# if "000000442654" in _["image"]:
# all_conv = ""
# for cur_conversation in _["conversations"]:
# all_conv += cur_conversation["value"]
# # if not os.path.exists(f'/mnt/bn/vl-research/workspace/boli01/data/playground/data/{_["image"]}'):
# import pdb; pdb.set_trace()
# Write new_data to a new JSON file
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k_minus_llava_instruct_150k_minus_sharegpt_plus_mixtral_instruct_135K_of_158K_V1.5.json", "w") as f:
json.dump(new_data, f)
# About Training Scripts
We first release the basic training scripts for LLaVA-NeXT. They are based on the previous LLaVA training scripts, so researchers familiar with LLaVA will find them easy to use.
We will gradually release more detailed training scripts for our LLaVA-OneVision models, covering the mid stage, the single-image final stage, and the one-vision final stage.
> They are basically the same as the basic training scripts, but with some modifications, such as the data yaml.
- `finetune_clip.sh`: This can be seen as the training script for the first image version of LLaVA-NeXT (2024-01), using the `anyres` strategy with at most 2x2 image grids.
- `finetune_siglip.sh`: The same, but with the `siglip` encoder; each grid cell becomes 729 tokens.
- `finetune_onevision.sh`: This is our latest training script, using the `anyres_max_9` strategy with image grids ranging from 1x1 to 6x6, up to a 2304x2304 resolution. Inside the script, we also incorporate multi-image and video data into the training loop. The detailed token strategy can be found in our paper; a rough sketch of the grid enumeration follows this list.
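As a rough illustration of how an `--image_grid_pinpoints`-style list relates to these grids, here is a minimal sketch. It is not the repository implementation (the actual anyres selection logic is more involved), and the patch size and grid limit below are illustrative assumptions:

```python
# Illustrative sketch only: enumerate (height, width) pinpoints for grids from
# 1x1 up to 6x6 cells of a 384px patch, then pick the candidate whose aspect
# ratio is closest to the input image's. Patch size and grid limit are
# illustrative assumptions, not the repo's exact settings.
from typing import List, Tuple


def build_grid_pinpoints(patch: int = 384, max_grid: int = 6) -> List[Tuple[int, int]]:
    """Enumerate candidate (height, width) resolutions for an anyres grid."""
    return [(rows * patch, cols * patch)
            for rows in range(1, max_grid + 1)
            for cols in range(1, max_grid + 1)]


def pick_resolution(image_hw: Tuple[int, int], pinpoints: List[Tuple[int, int]]) -> Tuple[int, int]:
    """Pick the pinpoint whose aspect ratio best matches the image."""
    img_h, img_w = image_hw
    target = img_w / img_h
    return min(pinpoints, key=lambda hw: abs(hw[1] / hw[0] - target))


if __name__ == "__main__":
    grid = build_grid_pinpoints()              # 36 candidates, up to 2304x2304
    print(pick_resolution((720, 1280), grid))  # a wide image maps to a wide grid: (1152, 1920)
```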
# About the LLaVA-OneVision Data
We need to address the fact that our data has been collected and used across different projects and by different people. LLaVA-OneVision is our first attempt to integrate these datasets. For the data that has already been uploaded, we will refer you to the corresponding locations. We kindly ask everyone to gather the "fragments" and piece them together into a "diamond" in your own environment.
Here we explain some technical details of our data.
- **pretrain data** - BLIP558K (same as the previous LLaVA-1.5 series)
- **mid stage data mixture**
```yaml
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/blip558k_stage1.5_finetune_w_prompt.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/coco118k_stage1.5_finetune_w_prompt.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/cc3m_recap_data_prompt_v2.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_tr_sft.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/instruct_azure_dc_zh_92K.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/allava/Evol-Instruct-GPT4-Turbo-143K.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_zh/synthdog_zh_100k.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_en/synthdog_en_100k.json
sampling_strategy: all
```
The first three datasets can be collected via the [LLaVA-Recap](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff) series data. We made a slight modification to make the data more compatible with the other data by appending a prompt after `<image>` in each question. You may download our dataset and process it in your local environment. To each item we added the **same prompt**: "\nPlease generate detailed descriptions of the given image.". Here's an example (a minimal processing sketch follows it):
```json
{
"id": "000000000009",
"image": "coco/train2017/000000000009.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nPlease generate detailed descriptions of the given image."
},
{
"from": "gpt",
"value": "The image shows a meal served in a blue tray with compartments. In the top left compartment, there is a slice of bread with a spread that appears to be butter, accompanied by a few almonds and a slice of what looks like a baked potato or sweet potato. The top right compartment contains a variety of fruits, including what seems to be pineapple, orange slices, and possibly a piece of melon.\n\nIn the bottom left compartment, there is a piece of bread with a spread that could be butter or margarine, and a small portion of what might be a meatball or a similar type of meat covered in a sauce. The bottom right compartment contains a serving of broccoli, which appears to be steamed or lightly cooked.\n\nThe meal is presented in a way that suggests it is a balanced meal, with a variety of food groups represented carbohydrates (bread), protein (meatball), healthy fats (almonds and butter), and fruits and vegetables (broccoli and the fruit assortment). The image is a close-up photograph with a focus on the food, and the colors are vibrant, indicating freshness."
}
]
}
```
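For reference, a minimal sketch of the modification described above, assuming items in the format shown (the file names here are placeholders, not the released ones):

```python
# Minimal sketch: append the same prompt after "<image>" in the human turn of
# each recap item, matching the format in the example above.
# Input/output paths are placeholders for illustration.
import json

PROMPT = "\nPlease generate detailed descriptions of the given image."

with open("llava_recap_subset.json") as f:          # placeholder input path
    items = json.load(f)

for item in items:
    for turn in item["conversations"]:
        if turn["from"] == "human" and turn["value"].strip() == "<image>":
            turn["value"] = "<image>" + PROMPT

with open("llava_recap_subset_w_prompt.json", "w") as f:  # placeholder output path
    json.dump(items, f, ensure_ascii=False)
```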
The remaining files are being uploaded to [LLaVA-OneVision Mid-Stage Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Mid-Data), except for `instruct_azure_dc_zh_92K`, which is internal data with policy issues (sad, but we can at least preview one example below). The `instruct_azure_dc_zh_92K` images are from COCO, and the QAs were generated with the GPT-4V service provided by Azure AI. We used this data in early-stage experiments hoping to add Chinese knowledge to our model, and simply kept using it (yes, just legacy, no other specific reason).
```json
{
"id": "000000051920",
"image": "azure_dc_zh_92K/000000051920.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\n这幅画中的关键信息有哪些?"
},
{
"from": "gpt",
"value": "这张图片展示了一位穿着深色(可能是黑色或深蓝色)毛衣的人坐在一张桌子前,正在用右手拿着一片披萨。披萨看起来有多种配料,包括绿色的菠菜和橙红色的番茄,放在一个银色的圆形披萨盘上。披萨盘上还剩下大约六片披萨。在背景中,可以看到一个穿着浅色衣服的人站在柜台后面,柜台上方有一个黑板菜单,上面用白色粉笔写着菜单项,但是字迹不清晰,无法阅读。整个场景看起来像是在一个披萨店内部,环境显得比较休闲。"
}
],
"data_source": "azure_dc_zh_92K"
}
```
- **single-image stage data mixture**
We have placed our single-image stage data in [single-image-yaml](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/scripts/train/single_image.yaml) for users to review. You can download each subset from [onevision-data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data).
Inside the data yaml, the first entry corresponds to the previous LLaVA-1.6/NeXT 790K data, which you can download from [llava-next-data](https://huggingface.co/datasets/lmms-lab/LLaVA-NeXT-Data).
Inside the yaml, the naming differs from the figure in our paper for writing reasons. If you need to explore our dataset, you can check the [upload script](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/0070d0ae4931c9b19d9cc57c38e16a87c270a61c/playground/upload_data.py#L175) to find the mapping from our local dataset names to the HF version.
- **onevision stage data mixture**
Our onevision stage data is available in [onevision-yaml](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/scripts/train/onevision.yaml). The single-image portion can be downloaded from the Hugging Face onevision-data link above. Here's a breakdown of each part (a minimal yaml-parsing sketch follows the list):
- Around 800K higher-quality data re-sampled from the previous stage (yes, it's data replay!).
- Multi-image data is released in [M4-Instruct Data](https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data). In our training yaml we combine the different subsets into two jsons (as they are mainly from DEMON and Mantis):
  - `/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_DEMON-FULL_filtered_311085.json`
  - `/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_mantis-instruct_reformatted.json`
- Video data: we have released the video part along with [llava-video-data](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K). Users can download the data; the subsets used in LLaVA-OneVision are:
  - `0_30_s_academic_v0_1` captions
  - `0_30_s_academic_v0_1` open-ended QA
  - LLaVA-Hound: 240,000 open-ended QA items and 15,000 caption entries from the video data used in LLaVA-Hound.
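As referenced above, here is a minimal sketch of how a data yaml in the layout shown for the mid-stage mixture (a top-level `datasets` list with `json_path` and `sampling_strategy` entries) could be merged into one training list. Only the `all` strategy is handled; handling of any other strategy is an assumption left out of this sketch:

```python
# Minimal sketch, assuming the yaml layout shown in the mid-stage example:
#   datasets:
#     - json_path: /path/to/subset.json
#       sampling_strategy: all
# Only "all" is handled; other strategies in the released yamls are not
# reproduced here.
import json
import yaml


def load_mixture(yaml_path: str) -> list:
    with open(yaml_path) as f:
        spec = yaml.safe_load(f)
    merged = []
    for entry in spec["datasets"]:
        with open(entry["json_path"]) as jf:
            subset = json.load(jf)
        if entry.get("sampling_strategy", "all") != "all":
            # Conservative fallback for this sketch: keep everything.
            pass
        merged.extend(subset)
    return merged


if __name__ == "__main__":
    mixture = load_mixture("scripts/train/onevision.yaml")  # yaml from the repo link above
    print(f"total training samples: {len(mixture)}")
```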
#!/bin/bash
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION="qwen_1_5"
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${LLM_VERSION} \
--version ${PROMPT_VERSION} \
--data_path=llava_1_6.json \
--image_folder your_image_folder \
--pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir "/checkpoints/${MID_RUN_NAME}" \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 3000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn
#!/bin/bash
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION="qwen_1_5"
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${CKPT_PATH} \
--version ${PROMPT_VERSION} \
--data_path=llava_1_6.json \
--image_folder your_image_folder \
--pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir "/checkpoints/${MID_RUN_NAME}" \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 3000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn