Commit 34164470 authored by chenzk

v1.0

{"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "openai/clip-vit-base-patch32", "model_max_length": 77, "special_tokens_map_file": "/home/suraj/.cache/huggingface/transformers/18a566598f286c9139f88160c99f84eec492a26bd22738fa9cb44d5b7e0a5c76.cce1206abbad28826f000510f22f354e53e66a97f7c23745a7dfe27609cc07f5", "tokenizer_class": "CLIPTokenizer"}
bash run.sh mobilevlm_v2_1.7b pretrain mtgv/MobileVLM_V2-1.7B openai/clip-vit-large-patch14-336
#torch==2.0.1
#torchvision==0.15.2
#deepspeed==0.9.5
transformers==4.33.1
tokenizers==0.13.3
sentencepiece==0.1.99
shortuuid==1.0.11
accelerate==0.21.0
peft==0.4.0
#bitsandbytes==0.41.0
pydantic==1.10.13
markdown2==2.4.8
numpy
scikit-learn==1.2.2
gradio==3.35.2
requests==2.28.2
httpx==0.24.0
uvicorn==0.22.0
fastapi==0.103.0
einops==0.6.1
einops-exts==0.0.4
timm==0.9.12
#flash-attn==2.3.2
#!/usr/bin/env bash
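# MobileVLM V2 training/evaluation driver.
#   bash run.sh <arch> pretrain-finetune-test <language_model> <vision_model>
#   bash run.sh <arch> pretrain|finetune <language_model> <vision_model> <output_dir>
#   bash run.sh <arch> test <checkpoint_dir>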
WORK_DIR=$(cd "$(dirname "$0")";pwd)
export PYTHONPATH=${WORK_DIR}
ARCH=$1
TASK=$2
case ${TASK} in
"pretrain-finetune-test")
cd ${WORK_DIR}
OUTPUT_DIR=${WORK_DIR}/outputs/${ARCH}_$(date +"%Y%m%d_%H%M%S")
mkdir -p ${OUTPUT_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
bash run.sh ${ARCH} pretrain ${LANGUAGE_MODEL} ${VISION_MODEL} ${OUTPUT_DIR}
bash run.sh ${ARCH} finetune ${LANGUAGE_MODEL} ${VISION_MODEL} ${OUTPUT_DIR}
bash run.sh ${ARCH} test ${OUTPUT_DIR}/mobilevlm_v2-2.finetune
;;
"pretrain")
echo ">>> Start Pre-training ..."
cd ${WORK_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
OUTPUT_DIR=$5
OUTPUT_DIR_PT=${OUTPUT_DIR}/mobilevlm_v2-1.pretrain
mkdir -p ${OUTPUT_DIR_PT}
deepspeed mobilevlm/train/train_mem.py \
--deepspeed scripts/deepspeed/zero2.json \
--model_name_or_path ${LANGUAGE_MODEL} \
--version plain \
--data_path data/pretrain_data/share-captioner_coco_lcs_sam_1246k_1107.json \
--image_folder data/pretrain_data \
--vision_tower ${VISION_MODEL} \
--vision_tower_type clip \
--mm_projector_type ldpnetv2 \
--mm_projector_lr 1e-3 \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--fp16 True \
--output_dir ${OUTPUT_DIR_PT} \
--num_train_epochs 1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 False \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee ${OUTPUT_DIR_PT}/log.txt &&
echo "Done."
;;
"finetune")
echo ">>> Start Multi-task Training ..."
cd ${WORK_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
OUTPUT_DIR=$5
OUTPUT_DIR_PT=${OUTPUT_DIR}/mobilevlm_v2-1.pretrain
OUTPUT_DIR_FT=${OUTPUT_DIR}/mobilevlm_v2-2.finetune
mkdir -p ${OUTPUT_DIR_FT}
deepspeed mobilevlm/train/train_mem.py \
--deepspeed scripts/deepspeed/zero3.json \
--model_name_or_path ${OUTPUT_DIR_PT} \
--version v1 \
--data_path data/finetune_data/MobileVLM_V2_FT_Mix2M.json \
--image_folder data/finetune_data \
--vision_tower ${VISION_MODEL} \
--vision_tower_type clip \
--mm_projector_type ldpnetv2 \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--fp16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 2000 \
--save_total_limit 1 \
--learning_rate 4e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 False \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt &&
echo "Done."
;;
"test")
echo ">>> Start Evaluation ..."
cd ${WORK_DIR}
OUTPUT_DIR=$3
bash scripts/benchmark.sh ${OUTPUT_DIR}
;;
*)
echo "error with ${DATASET_ID}"
esac
#!/usr/bin/env bash
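# MobileVLM (v1) training/evaluation driver; <arch> is mobilevlm1.7b or mobilevlm3b.
#   bash run_v1.sh <arch> pretrain-finetune-test <language_model> <vision_model>
#   bash run_v1.sh <arch> pretrain|finetune|finetune.lora <language_model> <vision_model> <output_dir>
#   bash run_v1.sh <arch> test <checkpoint_dir>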
WORK_DIR=$(cd "$(dirname "$0")";pwd)
export PYTHONPATH=${WORK_DIR}
ARCH=$1
TASK=$2
case ${TASK} in
"pretrain-finetune-test")
cd ${WORK_DIR}
OUTPUT_DIR=${WORK_DIR}/outputs/${ARCH}_$(date +"%Y%m%d_%H%M%S")
mkdir -p ${OUTPUT_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
bash run_v1.sh ${ARCH} pretrain ${LANGUAGE_MODEL} ${VISION_MODEL} ${OUTPUT_DIR}
bash run_v1.sh ${ARCH} finetune ${LANGUAGE_MODEL} ${VISION_MODEL} ${OUTPUT_DIR}
bash run_v1.sh ${ARCH} test ${OUTPUT_DIR}/mobilevlm-2.finetune
;;
"pretrain")
echo ">>> Start Feature-Alignment Pretrain ..."
cd ${WORK_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
OUTPUT_DIR=$5
OUTPUT_DIR_PT=${OUTPUT_DIR}/mobilevlm-1.pretrain
mkdir -p ${OUTPUT_DIR_PT}
deepspeed mobilevlm/train/train_mem.py \
--deepspeed scripts/deepspeed/zero2.json \
--model_name_or_path ${LANGUAGE_MODEL} \
--version plain \
--data_path data/pretrain_data/blip_laion_cc_sbu_558k.json \
--image_folder data/pretrain_data/images \
--vision_tower ${VISION_MODEL} \
--vision_tower_type clip \
--mm_projector_type ldpnet \
--tune_mm_mlp_adapter True \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--bf16 True \
--output_dir ${OUTPUT_DIR_PT} \
--num_train_epochs 1 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 24000 \
--save_total_limit 1 \
--learning_rate 1e-3 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee ${OUTPUT_DIR_PT}/log.txt &&
echo "Done."
;;
"finetune")
echo ">>> Start Visual-Instruction Tuning ..."
cd ${WORK_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
OUTPUT_DIR=$5
OUTPUT_DIR_PT=${OUTPUT_DIR}/mobilevlm-1.pretrain
OUTPUT_DIR_FT=${OUTPUT_DIR}/mobilevlm-2.finetune
mkdir -p ${OUTPUT_DIR_FT}
declare -A LR_CONF
LR_CONF=([mobilevlm1.7b]=2e-5 [mobilevlm3b]=4e-5)
declare -A DS_CONF # DeepSpeed
DS_CONF=([mobilevlm1.7b]=zero2 [mobilevlm3b]=zero3) # empirically
deepspeed mobilevlm/train/train_mem.py \
--deepspeed scripts/deepspeed/${DS_CONF[${ARCH}]}.json \
--model_name_or_path ${LANGUAGE_MODEL} \
--version v1 \
--data_path data/finetune_data/llava_v1_5_mix665k.json \
--image_folder data/finetune_data \
--vision_tower ${VISION_MODEL} \
--vision_tower_type clip \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR_PT}/mm_projector.bin \
--mm_projector_type ldpnet \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate ${LR_CONF[${ARCH}]} \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt &&
echo "Done."
;;
"test")
echo ">>> Start Evaluation ..."
cd ${WORK_DIR}
OUTPUT_DIR=$3
bash scripts/benchmark.sh ${OUTPUT_DIR}
;;
"finetune.lora")
echo ">>> Start Visual-Instruction Tuning with LoRA..."
cd ${WORK_DIR}
LANGUAGE_MODEL=$3
VISION_MODEL=$4
OUTPUT_DIR=$5
OUTPUT_DIR_PT=${OUTPUT_DIR}/mobilevlm-1.pretrain
OUTPUT_DIR_FT=${OUTPUT_DIR}/mobilevlm-2.finetune-lora
mkdir -p ${OUTPUT_DIR_FT}
declare -A DS_CONF
DS_CONF=([mobilevlm1.7b]=zero2 [mobilevlm3b]=zero3)
deepspeed mobilevlm/train/train_mem.py \
--deepspeed scripts/deepspeed/${DS_CONF[${ARCH}]}.json \
--lora_enable True --lora_r 128 --lora_alpha 256 \
--learning_rate 2e-4 \
--model_name_or_path ${LANGUAGE_MODEL} \
--version v1 \
--data_path data/finetune_data/llava_v1_5_mix665k.json \
--image_folder data/finetune_data \
--vision_tower ${VISION_MODEL} \
--vision_tower_type clip \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR_PT}/mm_projector.bin \
--mm_projector_type ldpnet \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt &&
python3 scripts/mergelora.py ${LANGUAGE_MODEL} ${OUTPUT_DIR}/mobilevlm-2.finetune-lora ${OUTPUT_DIR}/mobilevlm-2.finetune
echo "Done."
;;
*)
echo "error with ${DATASET_ID}"
esac
#!/usr/bin/env bash
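# Run the full benchmark suite (MME, GQA, TextVQA, POPE, MMBench, ScienceQA) on a
# trained checkpoint; results are written to <checkpoint_dir>/../mobilevlm-3.evaluation.
#   bash scripts/benchmark.sh <checkpoint_dir>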
WORK_DIR=$(cd "$(dirname "$0")/..";pwd)
export PYTHONPATH=${WORK_DIR}
CHECKPOINT_PATH=$1
OUTPUT_DIR_EVAL=$(cd "$(dirname "${CHECKPOINT_PATH}")";pwd)/mobilevlm-3.evaluation
mkdir -p ${OUTPUT_DIR_EVAL}
CONV_MODE=v1
cd ${WORK_DIR}
DATASET_NAME=mme
MODEL_GENERATOR=mobilevlm.eval.model_vqa_loader
DATA_ROOT=${WORK_DIR}/data/benchmark_data/mme
SPLIT_NAME=llava_mme
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
DATASET_NAME=gqa
MODEL_GENERATOR=mobilevlm.eval.model_vqa_loader
DATA_ROOT=${WORK_DIR}/data/benchmark_data/gqa
SPLIT_NAME=llava_gqa_testdev_balanced
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
DATASET_NAME=textvqa
MODEL_GENERATOR=mobilevlm.eval.model_vqa_loader
DATA_ROOT=${WORK_DIR}/data/benchmark_data/textvqa
SPLIT_NAME=llava_textvqa_val_v051_ocr
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
DATASET_NAME=pope
MODEL_GENERATOR=mobilevlm.eval.model_vqa_loader
DATA_ROOT=${WORK_DIR}/data/benchmark_data/pope
SPLIT_NAME=llava_pope_test
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
DATASET_NAME=mmbench
MODEL_GENERATOR=mobilevlm.eval.model_vqa_mmbench
DATA_ROOT=${WORK_DIR}/data/benchmark_data/mmbench
SPLIT_NAME=mmbench_dev_en_20231003
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
DATASET_NAME=sqa
MODEL_GENERATOR=mobilevlm.eval.model_vqa_science
DATA_ROOT=${WORK_DIR}/data/benchmark_data/sqa
SPLIT_NAME=llava_test_CQM-A
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/benchmark/${DATASET_NAME}.sh \
${MODEL_GENERATOR} ${CHECKPOINT_PATH} ${CONV_MODE} ${SPLIT_NAME} ${DATA_ROOT} ${OUTPUT_DIR_EVAL}/${DATASET_NAME}
#!/bin/bash
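# GQA evaluation: shards inference across the GPUs in CUDA_VISIBLE_DEVICES, merges the
# per-GPU answer files, converts them for GQA, and runs the GQA eval script.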
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.jsonl \
--image-folder ${DATA_ROOT}/images \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/merge.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
python ${DATA_ROOT}/convert_gqa_for_eval.py --src ${RESULT_FILE} --dst ${SAVE_PATH}/merge.json
python ${DATA_ROOT}/eval.py \
--questions ${DATA_ROOT}/testdev_balanced_questions.json \
--predictions ${SAVE_PATH}/merge.json
#!/bin/bash
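# MMBench evaluation: shards inference across visible GPUs, merges the answers, converts
# them to the submission format, and scores the result against the annotation TSV.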
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.tsv \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--lang en \
--single-pred-prompt \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/predictions.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
python ${DATA_ROOT}/convert_mmbench_for_submission.py \
--annotation-file ${DATA_ROOT}/${SPLIT_NAME}.tsv \
--result-dir ${SAVE_PATH} \
--upload-dir ${SAVE_PATH} \
--experiment predictions
python ${DATA_ROOT}/eval.py --result ${SAVE_PATH}/predictions.xlsx --meta ${DATA_ROOT}/${SPLIT_NME}.tsv
#!/bin/bash
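# MME evaluation: shards inference across visible GPUs, merges the answers, converts
# them to the MME result layout, and runs MME's calculation script to compute scores.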
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.jsonl \
--image-folder ${DATA_ROOT}/images \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/predictions.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
cd ${DATA_ROOT}
python convert_answer_to_mme.py --data_path ${DATA_ROOT}/images --experiment ${SAVE_PATH}/predictions.jsonl
python ${DATA_ROOT}/calculation.py --results_dir ${SAVE_PATH}/predictions
#!/bin/bash
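# POPE evaluation: shards inference over the COCO val2014 images across visible GPUs,
# merges the answers, and scores them against the POPE annotations under coco/.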
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.jsonl \
--image-folder ${DATA_ROOT}/val2014 \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/predictions.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
python ${DATA_ROOT}/eval.py \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.jsonl \
--annotation-dir ${DATA_ROOT}/coco \
--result-file ${SAVE_PATH}/predictions.jsonl
#!/bin/bash
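# ScienceQA evaluation: shards inference across visible GPUs, merges the answers,
# and scores the single-choice predictions with the ScienceQA eval script.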
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MMEVAL_ROOT=$(cd "$(dirname "$0")/..";pwd)
EVAL_ROOT=${MMEVAL_ROOT}/benchmarks/scienceqa
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.json \
--image-folder ${DATA_ROOT}/images/test \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--single-pred-prompt \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/predictions.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
python ${DATA_ROOT}/eval.py \
--base-dir ${DATA_ROOT} \
--result-file ${SAVE_PATH}/predictions.jsonl \
--output-file ${SAVE_PATH}/output.jsonl \
--output-result ${SAVE_PATH}/result.json
#!/bin/bash
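# TextVQA evaluation: shards inference across visible GPUs, merges the answers,
# and scores them against TextVQA_0.5.1_val.json.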
GPUS="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$GPUS"
CHUNKS=${#GPULIST[@]}
MODEL_LOADER=$1
MODEL_DIR=$2
CONV_MODE=$3
SPLIT_NAME=$4
DATA_ROOT=$5
SAVE_PATH=$6/${SPLIT_NAME}
for IDX in $(seq 0 $((CHUNKS-1))); do
CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m ${MODEL_LOADER} \
--model-path ${MODEL_DIR} \
--question-file ${DATA_ROOT}/${SPLIT_NAME}.jsonl \
--image-folder ${DATA_ROOT}/train_images \
--answers-file ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl \
--num-chunks $CHUNKS \
--chunk-idx $IDX \
--temperature 0 \
--conv-mode ${CONV_MODE} &
done
wait
RESULT_FILE=${SAVE_PATH}/predictions.jsonl
> "${RESULT_FILE}" # Clear out the output file if it exists
for IDX in $(seq 0 $((CHUNKS-1))); do
cat ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl >> "${RESULT_FILE}" # Loop through the indices and concatenate each file
rm ${SAVE_PATH}/${CHUNKS}_${IDX}.jsonl
done
python ${DATA_ROOT}/eval.py \
--annotation-file ${DATA_ROOT}/TextVQA_0.5.1_val.json \
--result-file ${SAVE_PATH}/predictions.jsonl
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": false,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto"
}
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
import sys
import torch
import argparse
from PIL import Image
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.resolve()))
from mobilevlm.model.mobilevlm import load_pretrained_model
from mobilevlm.conversation import conv_templates, SeparatorStyle
from mobilevlm.utils import disable_torch_init, process_images, tokenizer_image_token, KeywordsStoppingCriteria
from mobilevlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
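# Single-image, single-prompt inference for a MobileVLM checkpoint.
# Example invocation (file names are illustrative):
#   python inference.py --model-path mtgv/MobileVLM-1.7B --image-file demo.jpg --prompt "Describe this image."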
def inference_once(args):
disable_torch_init()
model_name = args.model_path.split('/')[-1]
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.load_8bit, args.load_4bit)
images = [Image.open(args.image_file).convert("RGB")]
images_tensor = process_images(images, image_processor, model.config).to(model.device, dtype=torch.float16)
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + args.prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
# Input
input_ids = (tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda())
stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
# Inference
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images_tensor,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=args.max_new_tokens,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
# Result-Decode
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids")
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
print(f"🚀 {model_name}: {outputs.strip()}\n")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="mtgv/MobileVLM-1.7B")
    parser.add_argument("--image-file", type=str, required=True)  # consumed as args.image_file in inference_once
    parser.add_argument("--prompt", type=str, required=True)      # consumed as args.prompt in inference_once
    parser.add_argument("--conv-mode", type=str, default="v1")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=512)
    parser.add_argument("--load_8bit", action="store_true")  # type=bool would treat any non-empty string as True
    parser.add_argument("--load_4bit", action="store_true")
    args = parser.parse_args()
    inference_once(args)
import os
import sys
import torch
from pathlib import Path
from peft import PeftModel
from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
sys.path.append(str(Path(__file__).parent.parent.resolve()))
from mobilevlm.model.mobilellama import MobileLlamaForCausalLM
from mobilevlm.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
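# Merge LoRA adapter weights (plus the saved non-LoRA trainables) back into the base
# language model and save the result as a standalone checkpoint.
# Usage: python mergelora.py <base_model> <lora_checkpoint_dir> <save_path>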
def merge_lora(model_base, model_path, save_path):
kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
# Loading weight from base model
model = MobileLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
print("🔜 Don't worry, we will load vision-tower weight soon later...")
token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
# Loading additional non-lora weights
non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
if any(k.startswith('model.model.') for k in non_lora_trainables):
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
model.load_state_dict(non_lora_trainables, strict=False)
# Loading lora weights and merge
model = PeftModel.from_pretrained(model, model_path)
model = model.merge_and_unload()
# Loading vision-tower weights
mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
vision_tower.to(device=model.device, dtype=torch.float16)
print("✅ The vision-tower is loaded successful!")
# save
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
if __name__ == "__main__":
    merge_lora(sys.argv[1], sys.argv[2], sys.argv[3])