"src/vscode:/vscode.git/clone" did not exist on "54241df601e527fd310ab2cbdddd90bd2478bc84"
Commit 1b9205c9 authored by yangzhong

v1.0

from argparse import Namespace
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import open_clip
from sft_data_utils import make_supervised_data_module
IGNORE_INDEX = -100
if __name__ == '__main__':
    # Constants for the unit test.
    tokenizer_path = 'lmsys/vicuna-7b-v1.5'
    clip_vision_encoder_path = 'ViT-H-14-378-quickgelu'
    clip_vision_encoder_pretrained = 'dfn5b'
    cache_dir = '/export/share/manlis/models'

    # Load the tokenizer and ensure there is a pad token.
    text_tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        local_files_only=False,
        trust_remote_code=True,
        cache_dir=cache_dir,
        use_fast=False,
    )
    if text_tokenizer.pad_token is None or text_tokenizer.pad_token == text_tokenizer.eos_token:
        # Add a pad token if it doesn't exist.
        text_tokenizer.add_special_tokens({"pad_token": "<pad>"})

    # Add special tokens to the tokenizer and language model.
    special_tokens = {
        "media_token": "<image>",
    }
    text_tokenizer.add_special_tokens(
        {"additional_special_tokens": list(special_tokens.values())}
    )

    # Load the vision encoder (only its image preprocessing transform is used here).
    _, _, image_processor = open_clip.create_model_and_transforms(
        clip_vision_encoder_path,
        pretrained=clip_vision_encoder_pretrained,
        cache_dir=cache_dir,
        force_image_size=378,
    )

    # Create the dataset.
    args = Namespace(
        data_sampler_group_by_length=False,
        data_path='/export/share/manlis/data/lavis/llava_instruct_665k_sharegpt4v/annotations/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json',
        batch_size=8,
        world_size=8,
        gradient_accumulation_steps=1,
        rank=0,
        workers=4,
        image_aspect_ratio='pad',
        is_multimodal=True,
        mm_use_im_start_end=False,
    )
    train_dataset, total_num_samples = make_supervised_data_module(
        tokenizer=text_tokenizer,
        image_processor=image_processor,
        data_args=args,
    )

    # Iterate through all data samples and flag any whose labels are fully masked.
    print(len(train_dataset.dataloader))
    for i, sample in enumerate(train_dataset.dataloader):
        if (sample['labels'] == IGNORE_INDEX).all():
            print(f"sample {i} token mismatch")
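To see what the masking actually keeps, the test script above can be extended to decode the unmasked tokens of one batch. A minimal sketch, assuming the collator returns an 'input_ids' tensor alongside 'labels' (the field name is an assumption; adjust it to match sft_data_utils):

    # Sketch: decode the supervised (non-ignored) tokens of the first sample in a
    # batch to eyeball the label masking. Assumes 'input_ids' sits next to 'labels'.
    first_batch = next(iter(train_dataset.dataloader))
    labels = first_batch['labels'][0]
    input_ids = first_batch['input_ids'][0]
    supervised_ids = input_ids[labels != IGNORE_INDEX]
    print("Supervised text of sample 0:")
    print(text_tokenizer.decode(supervised_ids, skip_special_tokens=False))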
This diff is collapsed.
This diff is collapsed.
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  # Use the HF mirror endpoint.
from pathlib import Path
import argparse
from omegaconf import OmegaConf
import torch
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
from open_flamingo import create_model_and_transforms
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dest_fn",
        type=str,
        default="/blip-3_pytorch/pretrain_model/xgen-mm-phi3-mini-base-r-v1.5.pt",
    )
    args = parser.parse_args()
    return args
if __name__ == "__main__":
    args = parse_args()

    # Load the model, tokenizer, and image processor from the HF hub.
    # model_name_or_path = "/blip-3/pretrain_model/xgen-mm-phi3-mini-base-r-v1.5/"
    model_name_or_path = "Salesforce/xgen-mm-phi3-mini-base-r-v1.5"
    model = AutoModelForVision2Seq.from_pretrained(
        model_name_or_path, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, trust_remote_code=True, use_fast=True, legacy=False
    )
    image_processor = AutoImageProcessor.from_pretrained(
        model_name_or_path, trust_remote_code=True
    )
    tokenizer = model.update_special_tokens(tokenizer)

    # Test weight loading against a locally constructed model.
    # Set local model configs.
    cfg = dict(
        model_family="xgenmm_v1",
        lm_path="microsoft/Phi-3-mini-4k-instruct",
        vision_encoder_path="google/siglip-so400m-patch14-384",
        vision_encoder_pretrained="google",
        num_vision_tokens=128,
        image_aspect_ratio="anyres",
        anyres_patch_sampling=True,
        anyres_grids=[(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)],
    )
    cfg = OmegaConf.create(cfg)
    additional_kwargs = {
        "num_vision_tokens": cfg.num_vision_tokens,
        "image_aspect_ratio": cfg.image_aspect_ratio,
        "anyres_patch_sampling": cfg.anyres_patch_sampling,
    }
    # Initialize the local model.
    local_model, _, _ = create_model_and_transforms(
        clip_vision_encoder_path=cfg.vision_encoder_path,
        clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
        lang_model_path=cfg.lm_path,
        tokenizer_path=cfg.lm_path,
        model_family=cfg.model_family,
        **additional_kwargs,
    )
    try:
        local_model.load_state_dict(model.vlm.state_dict(), strict=True)
        print("Testing weight loading OK.")
    except Exception as e:
        print(e)

    # Export the model weights.
    print(f"Saving converted model weight to {args.dest_fn}")
    Path(args.dest_fn).parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.vlm.state_dict(), args.dest_fn)
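To confirm the export is usable, the .pt file can be reloaded into a freshly built local model. A minimal sketch appended to the script above, reusing the same cfg and additional_kwargs; strict=False and the CPU map_location are assumptions for a quick check, not requirements of the conversion:

    # Sketch: rebuild the local model and reload the exported state dict.
    reloaded_model, _, _ = create_model_and_transforms(
        clip_vision_encoder_path=cfg.vision_encoder_path,
        clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
        lang_model_path=cfg.lm_path,
        tokenizer_path=cfg.lm_path,
        model_family=cfg.model_family,
        **additional_kwargs,
    )
    state_dict = torch.load(args.dest_fn, map_location="cpu")
    missing, unexpected = reloaded_model.load_state_dict(state_dict, strict=False)
    print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")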
# IMAGE_FOLDER_DICT = {
IMAGE_FOLDER_DICT_GCP = {
    # "LLaVA-Pretrain": "/public/opendas/DL_DATA/LLaVA-Pretrain",
    # "ai2d": "/export/home/blip3_data/ocr_datasets/ai2d",
    # "dvqa": "/export/home/blip3_data/ocr_datasets/DVQA",
    # "docvqa": "/export/home/blip3_data/ocr_datasets/DocVQA",  # Put this before vg, because docvqa file names contain "vg" as a substring.
    # "ChartQA_Dataset": "/export/home/blip3_data/ocr_datasets/chartQA/ChartQA_Dataset",
    "coco/som_train2017": "/blip-3_pytorch/dataset/SoM-LLaVA/som_train2017",
    # "coco/train2017": "/export/home/blip3_data/coco/images/train2017",
    # "ocr_vqa": "/export/home/blip3_data/ocr_vqa",
    # "vg": "/export/home/blip3_data/visual-genome",
    # "gqa": "/export/home/blip3_data/GQA",
    # "share_textvqa": "/export/home/blip3_data/share_textvqa",  # Put this before the "textvqa" substring below.
    # "textvqa": "/export/home/blip3_data/TextVQA",
    # "wikiart": "/export/home/blip3_data/wikiart",
    # "sam/images": "/export/home/blip3_data/sam/images",
    # "web-celebrity": "/export/home/blip3_data/web-celebrity",
    # "web-landmark": "/export/home/blip3_data/web-landmark",
    # "llava/llava_pretrain": "/export/home/blip3_data/llava/llava_pretrain",
    # "train2017": "/export/home/blip3_data/coco/images/train2017",
}
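The ordering notes in the comments matter because image entries are typically matched against these keys by substring, so a key that is itself a substring of another (textvqa vs. share_textvqa) must come later. A minimal sketch of that kind of lookup; resolve_image_path is a hypothetical helper, not the loader actually used in training:

# Sketch (hypothetical): map a relative image entry to an absolute path by scanning
# the folder dict in insertion order and taking the first key found in the entry.
def resolve_image_path(relative_path, folder_dict=IMAGE_FOLDER_DICT_GCP):
    for key, root in folder_dict.items():
        if key in relative_path:
            suffix = relative_path.split(key, 1)[-1].lstrip("/")
            return f"{root}/{suffix}" if suffix else root
    raise FileNotFoundError(f"No image root configured for: {relative_path}")

# Example: "coco/som_train2017/000000000123.jpg"
#   -> "/blip-3_pytorch/dataset/SoM-LLaVA/som_train2017/000000000123.jpg"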
# Data args.
# Note: this is an example data config, not for reproducing xgen-mm-instruct.
data_path: {
    # '/blip-3/dataset/blip_laion_cc_sbu_558k_fixed.json': 558128,
    # '/blip-3/dataset/LLaVA-Pretrain/blip_laion_cc_sbu_558k_fixed.json': 558128,
    # '/mnt/xgen-mm/LLaVA-Pretrain/llava_all_path.json': 558128,
    # # LLaVA-665K.
    # '/export/home/blip3_data/llava_instruct_665k_sharegpt4v/annotations/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.json': 665058,  # Total: 665058.
    # SoM-LLaVA.
    '/blip-3_pytorch/dataset/SoM-LLaVA/som_qa_coco20k.json': 20160,
    '/blip-3_pytorch/dataset/SoM-LLaVA/som_listing_coco10k.json': 10000,
    # # Text-only (37k).
    # # '/export/share/manlis/data/allava-4v/Evol-Instruct-GPT4-Turbo-143K-filterd.json': 20000,  # Total: 143000.
    # '/export/home/blip3_data/text-only-sft-data/Python-Code-23k-ShareGPT.json': 10000,  # Total: 22608.
    # '/export/home/blip3_data/text-only-sft-data/gsm8k-main-train.json': 7473,
    # '/export/home/blip3_data/text-only-sft-data/slimorca-dedup.json': 10000,  # Total: 363491.
    # '/export/home/blip3_data/text-only-sft-data/orca-math-word-problems-200k.json': 10000,  # Total: 200035.
    # '/export/home/blip3_data/text-only-sft-data/lima-train.json': 5000,  # Total: 1030.
    # # OCR (72k).
    # '/export/home/blip3_data/ocr_datasets/ai2d/ai2d_multichoice_llava_format_single_img_token_train.json': 10000,  # Total: 2482.
    # '/export/home/blip3_data/ocr_datasets/DVQA/dvqa_llava_format.json': 20000,  # Total: 2325316.
    # '/export/home/blip3_data/ocr_datasets/DocVQA/docvqa_llava_format.json': 20649,
    # '/export/home/blip3_data/ocr_datasets/chartQA/chartqa_train_augmented_llava_format.json': 20901,
    # '/export/home/blip3_data/ocr_datasets/chartQA/chartqa_train_human_llava_format.json': 7398,
}
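Each entry maps an annotation JSON to the number of samples to draw from it, which makes it easy to sanity-check the mixture before training. A minimal sketch, assuming the active entries above have been loaded into a plain dict named data_path; the checking logic is an assumption, not part of the training code:

# Sketch: check that each configured annotation file exists and report totals.
import json
from pathlib import Path

data_path = {
    '/blip-3_pytorch/dataset/SoM-LLaVA/som_qa_coco20k.json': 20160,
    '/blip-3_pytorch/dataset/SoM-LLaVA/som_listing_coco10k.json': 10000,
}

total_requested = 0
for ann_file, num_samples in data_path.items():
    ann_path = Path(ann_file)
    if not ann_path.exists():
        print(f"Missing annotation file: {ann_path}")
        continue
    available = len(json.loads(ann_path.read_text()))
    print(f"{ann_path.name}: requesting {num_samples} of {available} samples")
    total_requested += num_samples
print(f"Total samples requested: {total_requested}")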
Run python /blip-3_pytorch/down_dataset_hf.py to download the SoM-LLaVA dataset from the HF hub:
import os
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from huggingface_hub import snapshot_download

# Download the SoM-LLaVA dataset into the path used by the data config above.
snapshot_download(
    repo_id="zzxslp/SoM-LLaVA",
    repo_type="dataset",
    local_dir='/blip-3_pytorch/dataset/SoM-LLaVA',
)
import os
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from huggingface_hub import snapshot_download

# Download the xgen-mm base checkpoint from the HF hub.
snapshot_download(
    repo_id="Salesforce/xgen-mm-phi3-mini-base-r-v1.5",
    local_dir='/blip-3_pytorch/pretrain_model/xgen-mm-phi3-mini-base-r-v1.5',
)
import json
from pathlib import Path

# 1. Configure paths (only this one parameter needs editing: the path to the JSON file).
json_file = Path("/public/opendas/DL_DATA/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json")
# The JSON's own directory is used as the image root, so it needs no manual edit.
image_root_dir = json_file.parent
# Where to save the fixed JSON (the "_fixed" suffix avoids overwriting the original file).
fixed_json_file = "/blip-3/dataset/blip_laion_cc_sbu_558k_fixed.json"

# 2. Read the original JSON data.
print(f"Reading original JSON: {json_file}")
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
if not isinstance(data, list):
    raise ValueError("The JSON file must contain a list (one element per sample).")
print(f"Read {len(data)} samples.")

# 3. Fix the image path of every sample (core logic).
fixed_count = 0    # Samples successfully fixed.
missing_count = 0  # Samples whose image path is missing.
for idx, sample in enumerate(data):
    # Get the relative image path from the sample. Common field names are "image",
    # "file_path", and "img_path"; adjust to match your JSON (e.g. use
    # sample.get("file_path") instead of "image").
    relative_img_path = sample.get("image")
    if not relative_img_path:
        missing_count += 1
        print(f"Warning: sample {idx} has no image path, skipping.")
        continue
    # Build the absolute path: JSON directory + relative path.
    absolute_img_path = image_root_dir / relative_img_path
    # Store it as a string (a Path object would not serialize cleanly to JSON).
    sample["image"] = str(absolute_img_path)
    # (Optional) verify the path exists so invalid images are caught early.
    if not absolute_img_path.exists():
        print(f"Warning: image for sample {idx} does not exist -> {absolute_img_path}")
    else:
        fixed_count += 1

# 4. Save the fixed JSON.
with open(fixed_json_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# Report the results.
print("\n" + "=" * 50)
print("Path fixing finished!")
print(f"Original samples: {len(data)}")
print(f"Fixed (path valid or joined): {fixed_count}")
print(f"Samples missing an image path: {missing_count}")
print(f"Fixed JSON written to: {fixed_json_file}")
# Print the first sample's path as a sanity check.
if len(data) > 0 and "image" in data[0]:
    print(f"Example path (first sample): {data[0]['image']}")