Commit d5878167 (llava-next) by mashun1
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Get the latest version of transformers from PyPI
latest_version=$(yolk -V transformers | cut -d ' ' -f 2)
# Upgrade if the installed version is not the latest
if [ "$installed_version" != "$latest_version" ]; then
    pip install -U transformers
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953"
wandb login $WANDB_API_KEY
export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME
export WANDB_PROJECT=LLaVA_Mixtral
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True
# --report_to wandb
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi
pip install pydantic

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Pin transformers to 4.36.2 if a different version is installed
if [ "$installed_version" != "4.36.2" ]; then
    pip install transformers==4.36.2
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c
export WANDB_PROJECT=llava
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14-336 \
--pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--unfreeze_mm_vision_tower True \
--mm_vision_tower_lr 2e-6 \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft \
--num_train_epochs 9 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "epoch" \
--save_steps 1500 \
--learning_rate 5e-6 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 4096 \
--gradient_checkpointing True \
--dataloader_num_workers 8 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
dataset_name=$1
cd /mnt/bn/vl-research/workspace/yhzhang/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi
pip install pydantic

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Pin transformers to 4.36.2 if a different version is installed
if [ "$installed_version" != "4.36.2" ]; then
    pip install transformers==4.36.2
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME=$dataset_name
# wandb configure
export WANDB_API_KEY=e464cc107357c7b38e87f239bc3eb2ce5fb73c7c
export WANDB_PROJECT=llava
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--unfreeze--anyres--sft
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero3.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data \
--vision_tower openai/clip-vit-large-patch14-336 \
--pretrain_mm_mlp_adapter /mnt/bn/vl-research/workspace/project/2023/LLaVA/checkpoints/ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--unfreeze--sft \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16
#!/bin/bash
# set up wandb
export WANDB_API_KEY=a651c244635bc6f913ab654af3f0eebaecdc9381
export WANDB_ENTITY=llava-vl
export WANDB_PROJECT=llava-next
export PYTHONWARNINGS="ignore"
cd /mnt/bn/vl-research/workspace/boli01/projects/lmms-eval
pip install -e .
# set up llava dev env
cd /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-clip_large_336px-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME='llava_caps20k_chartqa19k'
export WANDB_NAME=$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
export WANDB_MODE=online
wandb online
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" deepspeed --master_port 26000 --include localhost:0,1,2,3,4,5,6,7 llava/train/train_mem.py \
--deepspeed ./scripts/zero3_offload.json \
--model_name_or_path mistralai/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/llava_instruct/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/data/llava \
--vision_tower openai/clip-vit-large-patch14-336 \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--unfreeze_mm_vision_tower True \
--mm_vision_tower_lr 2e-6 \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--output_dir ./checkpoints/$PROJECT_NAME--llava1.6--336px--anyres--sft \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1500 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 4096 \
--gradient_checkpointing True \
--dataloader_num_workers 32 \
--lazy_preprocess True \
--report_to wandb \
--run_name $WANDB_NAME
# The arguments below are for a separate evaluation step (lmms-eval), not part
# of the training command above; they are kept commented out for reference.
# --eval_num_processes 4
# --task_names mme,docvqa_val
# --model_args pretrained=./checkpoints/$PROJECT_NAME--$DATA_NAME--336px--anyres--sft
# --limit 8
# --batch_size 1
# --log_samples
# --log_samples_suffix debug
# --output_path ./logs/
#!/bin/bash
cd /mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA
# Install yolk3k if not installed
if ! pip show yolk3k > /dev/null 2>&1; then
    pip install yolk3k
fi

# Get the installed version of transformers
installed_version=$(pip show transformers | grep Version | cut -d ' ' -f 2)
# Get the latest version of transformers from PyPI
latest_version=$(yolk -V transformers | cut -d ' ' -f 2)
# Upgrade if the installed version is not the latest
if [ "$installed_version" != "$latest_version" ]; then
    pip install -U transformers
fi

# Get the installed version of deepspeed
installed_version=$(pip show deepspeed | grep Version | cut -d ' ' -f 2)
# Pin deepspeed to 0.12.2 if a different version is installed
if [ "$installed_version" != "0.12.2" ]; then
    pip install deepspeed==0.12.2
fi

# Install flash-attn if not installed
if ! pip show flash-attn > /dev/null 2>&1; then
    pip install flash-attn --no-build-isolation
fi
################## MISTRAL ##################
PROMPT_VERSION=mistral_instruct
MODEL_VERSION="Mistral-7B-Instruct-v0.2"
################## MISTRAL ##################
################## project ##################
PROJECT_NAME="ds_llava-Mistral-7B-Instruct-v0.2-mlp2x_gelu-pretrain_blip558k_plain"
################## data ##################
DATA_NAME="llava_instruct_150k"
# wandb configure
export WANDB_API_KEY="03fc62d68025c9498cf6493432551badd7d4f953"
wandb login $WANDB_API_KEY
export WANDB_NAME=$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME
export WANDB_PROJECT=LLaVA_Mixtral
export WANDB_MODE=online
wandb online
deepspeed --master_port 26000 \
llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/$DATA_NAME.json \
--image_folder /mnt/bn/vl-research/workspace/boli01/data/playground/data/coco/train2017 \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/$PROJECT_NAME/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_projector_type mlp2x_gelu \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava--$PROJECT_NAME--$MODEL_VERSION--$DATA_NAME--finetune \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
#!/bin/bash
# Uncomment and set the following variables correspondingly to run this script:
################## VICUNA ##################
# PROMPT_VERSION=v1
# MODEL_VERSION="vicuna-v1-3-7b"
################## VICUNA ##################
################## LLaMA-2 ##################
# PROMPT_VERSION="llava_llama_2"
# MODEL_VERSION="llama-2-7b-chat"
################## LLaMA-2 ##################
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--lora_enable True \
--bits 4 \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path ./playground/data/llava_instruct_80k.json \
--image_folder /path/to/coco/train2017 \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True \
--dataloader_num_workers 16 \
--report_to wandb
#!/bin/bash
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path lmsys/vicuna-13b-v1.3 \
--version $PROMPT_VERSION \
--data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \
--image_folder /Data/ScienceQA/data/scienceqa/images/train \
--vision_tower openai/clip-vit-large-patch14 \
--pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \
--num_train_epochs 12 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
import argparse
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
def merge_lora(args):
    model_name = get_model_name_from_path(args.model_path)
    # Loading with model_base set merges the LoRA weights into the base model.
    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map="cpu")
    model.save_pretrained(args.save_model_path)
    tokenizer.save_pretrained(args.save_model_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True)
    parser.add_argument("--model-base", type=str, required=True)
    parser.add_argument("--save-model-path", type=str, required=True)
    args = parser.parse_args()

    merge_lora(args)
#!/bin/bash
# Uncomment and set the following variables correspondingly to run this script:
# MODEL_VERSION=vicuna-v1-3-7b
# MODEL_VERSION=llama-2-7b-chat
########### DO NOT CHANGE ###########
########### USE THIS FOR BOTH ###########
PROMPT_VERSION=plain
########### DO NOT CHANGE ###########
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path ./checkpoints/$MODEL_VERSION \
--version $PROMPT_VERSION \
--data_path /path/to/pretrain_data.json \
--image_folder /path/to/images \
--vision_tower openai/clip-vit-large-patch14 \
--tune_mm_mlp_adapter True \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--learning_rate 2e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb
import json
import os
import argparse
from tqdm import tqdm
import yaml
def check_missing_images(json_path, images_folder):
    data = json.load(open(json_path, "r"))
    missing_data = []
    for i, d in enumerate(tqdm(data)):
        image = d["image"] if "image" in d else ""
        if image != "":
            path = os.path.join(images_folder, image)
            if not os.path.exists(path):
                print(f"Missing image: {path}")
                missing_data.append(d)
    return missing_data


def read_yaml_to_llava_data(yaml_path, images_folder):
    print(f"Reading YAML file: {yaml_path}")
    with open(yaml_path, "r") as f:
        data = yaml.safe_load(f)
    llava_json_paths = data["datasets"]
    for item in llava_json_paths:
        json_path = item["json_path"]
        missing_data = check_missing_images(json_path, images_folder)
        if len(missing_data) > 0:
            print(f"Missing images in {json_path}:")
            for d in missing_data:
                print(d)


def direct_check_llava_data(json_path, images_folder):
    missing_data = check_missing_images(json_path, images_folder)
    if len(missing_data) > 0:
        print(f"Missing images in {json_path}:")
        for d in missing_data:
            print(d)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
    parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
    parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")
    args = parser.parse_args()

    if args.json_path != "":
        direct_check_llava_data(args.json_path, args.images_folder)
    elif args.yaml_path != "":
        read_yaml_to_llava_data(args.yaml_path, args.images_folder)
#!/bin/bash
CHUNKS=8
for IDX in {0..7}; do
    CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
        --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
        --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
        --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
        --answers-file ./test_llava-13b-chunk${CHUNKS}_${IDX}.jsonl \
        --num-chunks $CHUNKS \
        --chunk-idx $IDX \
        --conv-mode llava_v1 &
done
#!/bin/bash
CHUNKS=8
output_file="test_llava-13b.jsonl"
# Clear out the output file if it exists.
> "$output_file"
# Loop through the indices and concatenate each file.
for idx in $(seq 0 $((CHUNKS-1))); do
    cat "./test_llava-13b-chunk${CHUNKS}_${idx}.jsonl" >> "$output_file"
done
python llava/eval/eval_science_qa.py \
--base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \
--result-file ./test_llava-13b.jsonl \
--output-file ./test_llava-13b_output.json \
--output-result ./test_llava-13b_result.json
#!/bin/bash
export HIP_VISIBLE_DEVICES=4,5,6,7
# evaluate
./scripts/interleave/eval_interleave_3d.sh /home/LLaVA-NeXT/ckpts/llava-next-interleave-qwen-7b /home/LLaVA-NeXT/data/interleave_bench multi_image_in_domain
#./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_out_domain
#./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_view_in_domain
alias python=python3
CKPT_PATH=$1
NAME=$(echo "$CKPT_PATH" | awk -F'/' '{print $NF}')
echo $NAME
##### set images path
DATA_PATH=$2
EVAL_TYPE=$3
JSON_PATH=$2/$3.json
############################### eval multi-image
RESULT_NAME="logs/${NAME}/${EVAL_TYPE}"
echo $RESULT_NAME
mkdir -p logs/${NAME}
file_path=${RESULT_NAME}/result.jsonl
bash scripts/interleave/eval_multiprocess.sh \
${CKPT_PATH} \
${JSON_PATH} \
${RESULT_NAME} \
${DATA_PATH} \
"" \
4 0
python3 llava/eval/evaluate_interleave.py --result-dir ${RESULT_NAME}
#!/bin/bash
# Check that exactly seven arguments are passed
if [ "$#" -ne 7 ]; then
    echo "Usage: $0 <model_path> <question_path> <base_answer_path> <image_folder> <extra_prompt> <N> <temperature>"
    exit 1
fi
# Assign the command line arguments to variables
model_path=$1
question_path=$2
base_answer_path=$3
image_folder=$4
extra_prompt=$5
N=$6
temperature=$7
# Loop over each chunk/process
for (( chunk_id=0; chunk_id<N; chunk_id++ ))
do
# Define the answer path for each chunk
answer_path="${base_answer_path}/result_${chunk_id}.jsonl"
if [ -f "$answer_path" ]; then
rm "$answer_path"
fi
# Run the Python program in the background
CUDA_VISIBLE_DEVICES="$chunk_id" python3 llava/eval/model_vqa.py --model-path "$model_path" --question-file "$question_path" --answers-file "$answer_path" --num-chunks "$N" --chunk-idx "$chunk_id" --image-folder "$image_folder" --extra-prompt "$extra_prompt" --temperature "$temperature" &
# Uncomment below if you need a slight delay between starting each process
# sleep 0.1
done
# Wait for all background processes to finish
wait
merged_file="${base_answer_path}/result.jsonl"
if [ -f "$merged_file" ]; then
rm "$merged_file"
fi
# Merge all the JSONL files into one
#cat "${base_answer_path}"_*.jsonl > "${base_answer_path}.jsonl"
for ((i=0; i<N; i++)); do
input_file="${base_answer_path}/result_${i}.jsonl"
cat "$input_file" >> "${base_answer_path}/result.jsonl"
done
# remove the unmerged files
for (( chunk_id=0; chunk_id<N; chunk_id++ ))
do
# Define the answer path for each chunk
answer_path="${base_answer_path}/result_${chunk_id}.jsonl"
if [ -f "$answer_path" ]; then
rm "$answer_path"
fi
done
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
device = "cuda" # the device to load the model onto
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat")
prompt = "Give me a short introduction to large language model."
messages = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
import json
import os
from tqdm import tqdm
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k.json") as f:
llava_v1_5_mix665k = json.load(f) # 665298
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_instruct_150k.json") as f:
llava_instruct_150k = json.load(f) # 157712
# Create sets of "id" fields
mix665k_ids = set()
for item in llava_v1_5_mix665k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    mix665k_ids.add(f'{item["id"]}_{all_conv}')
instruct_150k_ids = set()
for item in llava_instruct_150k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    instruct_150k_ids.add(f'{item["id"]}_{all_conv}')
share_gpt_ids = set()
for item in llava_v1_5_mix665k:
    if "image" not in item:
        all_conv = ""
        for cur_conversation in item["conversations"]:
            all_conv += cur_conversation["value"]
        share_gpt_ids.add(f'{item["id"]}_{all_conv}')  # 40688
# Get "id" fields that are in mix665k but not in instruct_150k and share_gpt
new_ids = mix665k_ids - instruct_150k_ids - share_gpt_ids # 466898
# Get "id" fields that are in mix665k but not in share_gpt
# new_ids = mix665k_ids - share_gpt_ids #624610
# import pdb; pdb.set_trace()
# Filter mix665k data based on new_ids
new_data = []
for item in llava_v1_5_mix665k:
    all_conv = ""
    for cur_conversation in item["conversations"]:
        all_conv += cur_conversation["value"]
    if f'{item["id"]}_{all_conv}' in new_ids:
        new_data.append(item)

# Debugging breakpoint left over from development; keep it disabled for normal runs.
# import pdb
# pdb.set_trace()

with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/mixtral_instruct_135K_of_158K_V1.5.json") as f:
    new_mixtral_instruct = json.load(f)
# mixtral_instruct_50K_of_80K_V1.json@
# print(len(new_data))
# for _ in new_mixtral_instruct:
# # import pdb; pdb.set_trace()
# if "coco" not in _["image"]:
# _["image"] = f"coco/train2017/{_['image']}"
# new_data.append(_)
# print(len(instruct_150k_ids))
print(len(new_data))
# for _ in tqdm(new_data):
# if "image" in _:
# if "000000442654" in _["image"]:
# all_conv = ""
# for cur_conversation in _["conversations"]:
# all_conv += cur_conversation["value"]
# # if not os.path.exists(f'/mnt/bn/vl-research/workspace/boli01/data/playground/data/{_["image"]}'):
# import pdb; pdb.set_trace()
# Write new_data to a new JSON file
with open("/mnt/bn/vl-research/workspace/boli01/zzzprojects/LLaVA/playground/data/llava_v1_5_mix665k_minus_llava_instruct_150k_minus_sharegpt_plus_mixtral_instruct_135K_of_158K_V1.5.json", "w") as f:
json.dump(new_data, f)
# About Training Scripts
We first release the basic training scripts for LLaVA-NeXT. They are based on the previous LLaVA training scripts, so researchers familiar with LLaVA will find them easy to use.
We will gradually release more detailed training scripts for our LLaVA-OneVision models, covering the mid stage, the single-image final stage, and the one-vision final stage.
> They are basically the same as the basic training scripts, but with some modifications, such as the data yaml.
- `finetune_clip.sh`: This can be seen as the training script for the first image version of LLaVA-NeXT (2024-01), using the `anyres` strategy with at most 2x2 image grids.
- `finetune_siglip.sh`: The same, but with the `siglip` encoder; each grid cell becomes 729 tokens.
- `finetune_onevision.sh`: This is our latest training script, using the `anyres_max_9` strategy with image grids ranging from 1x1 to 6x6, up to a 2304x2304 resolution. Inside the script, we also incorporate multi-image and video data into the training loop. The detailed token strategy can be found in our paper; a rough sketch of the grid enumeration follows this list.
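As a rough illustration of how an `--image_grid_pinpoints`-style list relates to these grids, here is a minimal sketch. It is not the repository implementation (the actual anyres selection logic is more involved), and the patch size and grid limit below are illustrative assumptions:

```python
# Illustrative sketch only: enumerate (height, width) pinpoints for grids from
# 1x1 up to 6x6 cells of a 384px patch, then pick the candidate whose aspect
# ratio is closest to the input image's. Patch size and grid limit are
# illustrative assumptions, not the repo's exact settings.
from typing import List, Tuple


def build_grid_pinpoints(patch: int = 384, max_grid: int = 6) -> List[Tuple[int, int]]:
    """Enumerate candidate (height, width) resolutions for an anyres grid."""
    return [(rows * patch, cols * patch)
            for rows in range(1, max_grid + 1)
            for cols in range(1, max_grid + 1)]


def pick_resolution(image_hw: Tuple[int, int], pinpoints: List[Tuple[int, int]]) -> Tuple[int, int]:
    """Pick the pinpoint whose aspect ratio best matches the image."""
    img_h, img_w = image_hw
    target = img_w / img_h
    return min(pinpoints, key=lambda hw: abs(hw[1] / hw[0] - target))


if __name__ == "__main__":
    grid = build_grid_pinpoints()              # 36 candidates, up to 2304x2304
    print(pick_resolution((720, 1280), grid))  # a wide image maps to a wide grid: (1152, 1920)
```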
# About the LLaVA-OneVision Data
We need to address the fact that our data has been collected and used across different projects and by different people. LLaVA-OneVision is our first attempt to integrate these datasets. For the data that has already been uploaded, we will refer you to the corresponding locations. We kindly ask everyone to gather the "fragments" and piece them together into a "diamond" in your own environment.
Here we explain some technical details of our data.
- **pretrain data** - BLIP558K (same as the previous LLaVA-1.5 series)
- **mid stage data mixture**
```yaml
datasets:
- json_path: /mnt/bn/vl-research/data/llava_instruct/blip558k_stage1.5_finetune_w_prompt.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/coco118k_stage1.5_finetune_w_prompt.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/cc3m_recap_data_prompt_v2.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/ureader_tr_sft.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/instruct_azure_dc_zh_92K.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/allava/Evol-Instruct-GPT4-Turbo-143K.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_zh/synthdog_zh_100k.json
sampling_strategy: all
- json_path: /mnt/bn/vl-research/data/llava_instruct/synthdog_en/synthdog_en_100k.json
sampling_strategy: all
```
The first three datasets can be collected via the [LLaVA-Recap](https://huggingface.co/collections/lmms-lab/llava-next-6623288e2d61edba3ddbf5ff) series data. We made a slight modification to make the data more compatible with the other data by appending a prompt after `<image>` in each question. You may download our dataset and process it in your local environment. To each item we added the **same prompt**: "\nPlease generate detailed descriptions of the given image.". Here's an example (a minimal processing sketch follows it):
```json
{
"id": "000000000009",
"image": "coco/train2017/000000000009.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nPlease generate detailed descriptions of the given image."
},
{
"from": "gpt",
"value": "The image shows a meal served in a blue tray with compartments. In the top left compartment, there is a slice of bread with a spread that appears to be butter, accompanied by a few almonds and a slice of what looks like a baked potato or sweet potato. The top right compartment contains a variety of fruits, including what seems to be pineapple, orange slices, and possibly a piece of melon.\n\nIn the bottom left compartment, there is a piece of bread with a spread that could be butter or margarine, and a small portion of what might be a meatball or a similar type of meat covered in a sauce. The bottom right compartment contains a serving of broccoli, which appears to be steamed or lightly cooked.\n\nThe meal is presented in a way that suggests it is a balanced meal, with a variety of food groups represented carbohydrates (bread), protein (meatball), healthy fats (almonds and butter), and fruits and vegetables (broccoli and the fruit assortment). The image is a close-up photograph with a focus on the food, and the colors are vibrant, indicating freshness."
}
]
}
```
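For reference, a minimal sketch of the modification described above, assuming items in the format shown (the file names here are placeholders, not the released ones):

```python
# Minimal sketch: append the same prompt after "<image>" in the human turn of
# each recap item, matching the format in the example above.
# Input/output paths are placeholders for illustration.
import json

PROMPT = "\nPlease generate detailed descriptions of the given image."

with open("llava_recap_subset.json") as f:          # placeholder input path
    items = json.load(f)

for item in items:
    for turn in item["conversations"]:
        if turn["from"] == "human" and turn["value"].strip() == "<image>":
            turn["value"] = "<image>" + PROMPT

with open("llava_recap_subset_w_prompt.json", "w") as f:  # placeholder output path
    json.dump(items, f, ensure_ascii=False)
```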
The remaining files are being uploaded to [LLaVA-OneVision Mid-Stage Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Mid-Data), except for `instruct_azure_dc_zh_92K`, which is internal data with policy issues (sad, but we can at least preview one example below). The `instruct_azure_dc_zh_92K` images are from COCO, and the QAs were generated with the GPT-4V service provided by Azure AI. We used this data in early-stage experiments hoping to add Chinese knowledge to our model, and simply kept using it (yes, just legacy, no other specific reason).
```json
{
"id": "000000051920",
"image": "azure_dc_zh_92K/000000051920.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\n这幅画中的关键信息有哪些?"
},
{
"from": "gpt",
"value": "这张图片展示了一位穿着深色(可能是黑色或深蓝色)毛衣的人坐在一张桌子前,正在用右手拿着一片披萨。披萨看起来有多种配料,包括绿色的菠菜和橙红色的番茄,放在一个银色的圆形披萨盘上。披萨盘上还剩下大约六片披萨。在背景中,可以看到一个穿着浅色衣服的人站在柜台后面,柜台上方有一个黑板菜单,上面用白色粉笔写着菜单项,但是字迹不清晰,无法阅读。整个场景看起来像是在一个披萨店内部,环境显得比较休闲。"
}
],
"data_source": "azure_dc_zh_92K"
}
```
- **single-image stage data mixture**
We have placed our single-image stage data in [single-image-yaml](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/scripts/train/single_image.yaml) for users to review. You can download each subset from [onevision-data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data).
Inside the data yaml, the first entry corresponds to the previous LLaVA-1.6/NeXT 790K data, which you can download from [llava-next-data](https://huggingface.co/datasets/lmms-lab/LLaVA-NeXT-Data).
Inside the yaml, the naming differs from the figure in our paper for writing reasons. If you need to explore our dataset, you can check the [upload script](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/0070d0ae4931c9b19d9cc57c38e16a87c270a61c/playground/upload_data.py#L175) to find the mapping from our local dataset names to the HF version.
- **onevision stage data mixture**
Our onevision stage data is available in [onevision-yaml](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/scripts/train/onevision.yaml). The single-image portion can be downloaded from the Hugging Face onevision-data link above. Here's a breakdown of each part (a minimal yaml-parsing sketch follows the list):
- Around 800K higher-quality data re-sampled from the previous stage (yes, it's data replay!).
- Multi-image data is released in [M4-Instruct Data](https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data). In our training yaml we combine the different subsets into two jsons (as they are mainly from DEMON and Mantis):
  - `/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_DEMON-FULL_filtered_311085.json`
  - `/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/llava_ofa_mantis-instruct_reformatted.json`
- Video data: we have released the video part along with [llava-video-data](https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K). Users can download the data; the subsets used in LLaVA-OneVision are:
  - `0_30_s_academic_v0_1` captions
  - `0_30_s_academic_v0_1` open-ended QA
  - LLaVA-Hound: 240,000 open-ended QA items and 15,000 caption entries from the video data used in LLaVA-Hound.
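As referenced above, here is a minimal sketch of how a data yaml in the layout shown for the mid-stage mixture (a top-level `datasets` list with `json_path` and `sampling_strategy` entries) could be merged into one training list. Only the `all` strategy is handled; handling of any other strategy is an assumption left out of this sketch:

```python
# Minimal sketch, assuming the yaml layout shown in the mid-stage example:
#   datasets:
#     - json_path: /path/to/subset.json
#       sampling_strategy: all
# Only "all" is handled; other strategies in the released yamls are not
# reproduced here.
import json
import yaml


def load_mixture(yaml_path: str) -> list:
    with open(yaml_path) as f:
        spec = yaml.safe_load(f)
    merged = []
    for entry in spec["datasets"]:
        with open(entry["json_path"]) as jf:
            subset = json.load(jf)
        if entry.get("sampling_strategy", "all") != "all":
            # Conservative fallback for this sketch: keep everything.
            pass
        merged.extend(subset)
    return merged


if __name__ == "__main__":
    mixture = load_mixture("scripts/train/onevision.yaml")  # yaml from the repo link above
    print(f"total training samples: {len(mixture)}")
```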
#!/bin/bash
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="openai/clip-vit-large-patch14-336"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION="qwen_1_5"
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${LLM_VERSION} \
--version ${PROMPT_VERSION} \
--data_path=llava_1_6.json \
--image_folder your_image_folder \
--pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir "/checkpoints/${MID_RUN_NAME}" \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 3000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn
#!/bin/bash
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
############### Pretrain ################
PROMPT_VERSION="qwen_1_5"
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
llava/train/train_mem.py \
--deepspeed scripts/zero3.json \
--model_name_or_path ${CKPT_PATH} \
--version ${PROMPT_VERSION} \
--data_path=llava_1_6.json \
--image_folder your_image_folder \
--pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--mm_vision_tower_lr=2e-6 \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length True \
--image_aspect_ratio anyres \
--image_grid_pinpoints "[(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]" \
--mm_patch_merge_type spatial_unpad \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir "/checkpoints/${MID_RUN_NAME}" \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 3000 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 16 \
--lazy_preprocess True \
--report_to wandb \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--attn_implementation sdpa
# You can delete the sdpa attn_implementation if you want to use flash attn