Commit 876a36a4 authored by raojy's avatar raojy
Browse files

first

parent eda2afb8
import torch
from transformers import AutoModel, AutoTokenizer
# Load the InternVL-Chat-V1-5 checkpoint with ``load_in_8bit=True`` and write
# the resulting model and its tokenizer to a local release directory.
path = "OpenGVLab/InternVL-Chat-V1-5"
load_options = dict(
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    load_in_8bit=True,
)
model = AutoModel.from_pretrained(path, **load_options)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
# Model first, tokenizer second, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("release/InternVL-Chat-V1-5-Int8")
print("finished")
import argparse
import os.path
import torch
from internvl.model.internvl_chat import InternVLChatModel
# Export only the MLP projector (``model.mlp1``) of an InternVL chat checkpoint
# as a standalone state dict named ``mlp_projector.pth``.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
# Both arguments are required positionals, so no ``default`` is declared
# (argparse ignores it for required positionals).
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Checkpoint to load")
parser.add_argument(
    "output_path", type=str, help="Directory that receives mlp_projector.pth"
)
args = parser.parse_args()
model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
projector = model.mlp1.to(torch.bfloat16)
ckpt = projector.state_dict()
# torch.save does not create missing directories.
if args.output_path:
    os.makedirs(args.output_path, exist_ok=True)
output_path = os.path.join(args.output_path, "mlp_projector.pth")
torch.save(ckpt, output_path)
print("finished")
import concurrent.futures
import json
import os
import av
import numpy as np
import torch
from decord import VideoReader, cpu
from PIL import Image
from tqdm.auto import tqdm
# Number of frames sampled per video; passed to get_index as ``num_segments``.
num_segments = 1
# Root directory of the videos for evaluation dimension 10
# (items whose question_type_id is 10).
dimension10_dir = "./videos/20bn-something-something-v2"
# Root directory of the videos for evaluation dimension 11
# (items whose question_type_id is 11).
dimension11_dir = "./videos/EPIC-KITCHENS"
# Root directory of the videos for evaluation dimension 12
# (items whose question_type_id is 12).
dimension12_dir = "./videos/BreakfastII_15fps_qvga_sync"
def transform_video(buffer):
    """Convert a batch of decoded video frames into a list of PIL images.

    Args:
        buffer: Frame batch exposing either ``.numpy()`` or ``.asnumpy()``;
            the two are tried in that order. The converted array is iterated
            along its first axis, one frame per entry.

    Returns:
        list: One image per frame, built with ``Image.fromarray``, in order.

    Raises:
        TypeError: If ``buffer`` offers neither conversion method.
    """
    try:
        frames = buffer.numpy()
    except AttributeError:
        try:
            frames = buffer.asnumpy()
        except AttributeError:
            # Fail with the real cause instead of continuing with ``None`` and
            # tripping over an unrelated ``len(None)`` error further down.
            raise TypeError(
                "Both buffer.numpy() and buffer.asnumpy() failed."
            ) from None
    return [Image.fromarray(frame) for frame in frames]
def get_index(num_frames, num_segments):
    """Pick the frame offsets to sample from a clip of ``num_frames`` frames.

    When more segments are requested than there are frames, every offset
    ``0 .. num_frames - 1`` is returned. Otherwise ``num_segments`` offsets are
    spread uniformly over the clip, starting half a segment in.

    Returns:
        numpy.ndarray: The selected offsets.
    """
    if num_segments > num_frames:
        return np.array(list(range(num_frames)))
    # uniform sampling
    step = float(num_frames - 1) / num_segments
    first = int(step / 2)
    picks = []
    for segment_no in range(num_segments):
        picks.append(first + int(np.round(step * segment_no)))
    return np.array(picks)
def fetch_images(qa_item):
    """Decode the sampled frame(s) of the video referenced by a QA item.

    Args:
        qa_item: Mapping with ``question_type_id`` (10, 11 or 12) and
            ``data_id``. Types 11 and 12 also need ``segment``, whose first
            two entries are the start and end of the clip. For type 11 they
            are multiplied by the video's average fps and clamped to the
            video length; for type 12 they are used directly as frame indices.

    Returns:
        list: The images produced by ``transform_video`` for the sampled frames.

    Raises:
        ValueError: If ``question_type_id`` is not 10, 11 or 12.
    """
    question_type = qa_item["question_type_id"]
    use_pyav = False
    segment = None
    if question_type == 10:
        data_path = os.path.join(dimension10_dir, qa_item["data_id"])
    elif question_type == 11:
        data_path = os.path.join(dimension11_dir, qa_item["data_id"].split("/")[-1])
        segment = qa_item["segment"]
    elif question_type == 12:
        data_path = os.path.join(dimension12_dir, qa_item["data_id"])
        segment = qa_item["segment"]
        use_pyav = True
    else:
        # Without this guard the function fails later with an UnboundLocalError
        # on ``data_path`` that hides the actual problem.
        raise ValueError(f"Unsupported question_type_id: {question_type!r}")
    if segment is not None:
        start, end = segment[0], segment[1]
    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        # The container is closed as soon as every frame has been decoded.
        with av.open(data_path) as reader:
            frames = [
                torch.from_numpy(f.to_rgb().to_ndarray())
                for f in reader.decode(video=0)
            ]
        video_len = len(frames)
        start_frame = start
        end_frame = min(end, video_len)
        offset = get_index(end_frame - start_frame, num_segments)
        frame_indices = offset + start_frame
        buffer = torch.stack([frames[idx] for idx in frame_indices])
    else:
        # using decord for decoding videos in evaluation dimension 10-11
        vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
        video_len = len(vr)
        if segment is not None:
            # obtain start and end frame for the video segment in evaluation dimension 11
            fps = vr.get_avg_fps()
            start_frame = int(min(max(start * fps, 0), video_len - 1))
            end_frame = int(min(max(end * fps, 0), video_len - 1))
            tot_frames = int(end_frame - start_frame)
            offset = get_index(tot_frames, num_segments)
            frame_indices = offset + start_frame
        else:
            # sample frames of the video in evaluation dimension 10
            frame_indices = get_index(video_len - 1, num_segments)
        vr.seek(0)
        buffer = vr.get_batch(frame_indices)
    return transform_video(buffer)
def fetch_images_parallel(qa_item):
    """Return the QA item together with its decoded frames.

    Returns:
        tuple: ``(qa_item, images)`` where ``images`` is the result of
        ``fetch_images(qa_item)``.
    """
    images = fetch_images(qa_item)
    return qa_item, images
if __name__ == "__main__":
    # Extract the first sampled frame of every SEED-Bench video question and
    # save it as ``<question_type_id>_<question_id>.png``.
    with open("SEED-Bench.json") as f:
        data = json.load(f)
    video_img_dir = "SEED-Bench-video-image"
    # Image.save does not create the target directory.
    os.makedirs(video_img_dir, exist_ok=True)
    ques_type_id_to_name = {
        type_id: name for name, type_id in data["question_type"].items()
    }
    video_data = [x for x in data["questions"] if x["data_type"] == "video"]
    # No output file is opened here: the PNG frames are the script's only
    # product, and opening an undefined ``output`` path raised NameError
    # before any video was processed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_images = {
            executor.submit(fetch_images_parallel, qa_item): qa_item
            for qa_item in video_data
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_images),
            total=len(future_to_images),
        ):
            qa_item = future_to_images[future]
            try:
                qa_item, images = future.result()
            except Exception as exc:
                # Best effort: report the failing item and keep going.
                print(f"{qa_item} generated an exception: {exc}")
            else:
                img_file = f"{qa_item['question_type_id']}_{qa_item['question_id']}.png"
                images[0].save(os.path.join(video_img_dir, img_file))
import argparse
import torch
from internvl.model.internvl_chat import InternVLChatModel
# Export only the vision encoder (``model.vision_model``) of an InternVL chat
# checkpoint via ``save_pretrained``.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
# Both arguments are required positionals, so no ``default`` is declared
# (argparse ignores it for required positionals).
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Checkpoint to load")
parser.add_argument(
    "output_path", type=str, help="Directory that receives the vision model"
)
args = parser.parse_args()
model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
vision_model = model.vision_model.to(torch.bfloat16)
vision_model.save_pretrained(args.output_path)
print("finished")
import argparse
import json
import os
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
# Font (size 50) used by custom_image to stamp the camera name onto each tile.
# NOTE(review): loaded at import time from a hard-coded system path; presumably
# this fails on machines without DejaVu installed at that location — confirm.
FOOT = ImageFont.truetype("/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf", 50)
def _resolve_image_path(data_root, image_file):
    """Map an annotation image path onto a file below ``data_root``."""
    # The replacement must stay relative: os.path.join drops ``data_root``
    # entirely when the next component starts with "/".
    relative = image_file.replace("../nuscenes/samples/", "nuscenes/samples/")
    candidate = os.path.join(data_root, relative)
    if os.path.exists(candidate):
        return candidate
    # Backward compatibility: earlier runs resolved to the absolute
    # "/nuscenes/samples/..." location, so honour it when it is what exists.
    legacy = os.path.join(
        data_root,
        image_file.replace("../nuscenes/samples/", "/nuscenes/samples/"),
    )
    return legacy if os.path.exists(legacy) else candidate


def custom_image(img_paths, save_path, image_size=448, data_root=None):
    """Stitch the six camera views of each key frame into one labelled image.

    Every view is resized to ``(2 * image_size, image_size)``, stamped with its
    camera name in the top-left corner, and pasted into a 3-column by 2-row
    grid in the order front-left, front, front-right, back-left, back,
    back-right. The grid is saved as ``<image_id>.jpg`` under ``save_path``.

    Args:
        img_paths: Mapping ``image_id -> {camera_name: image_path}``.
        save_path: Existing directory that receives the stitched images.
        image_size: Tile height in pixels; tiles are twice as wide.
        data_root: Directory the image paths are resolved against. Defaults to
            ``args.data_root`` from the module-level command-line namespace.

    Raises:
        KeyError: If a frame lacks one of the six expected camera names.
    """
    if data_root is None:
        data_root = args.data_root
    captions = [
        "CAM_FRONT_LEFT",
        "CAM_FRONT",
        "CAM_FRONT_RIGHT",
        "CAM_BACK_LEFT",
        "CAM_BACK",
        "CAM_BACK_RIGHT",
    ]
    width = image_size * 2
    height = image_size
    for image_id, image_files in tqdm(img_paths.items()):
        tiles = {}
        for caption, image_file in image_files.items():
            image_path = _resolve_image_path(data_root, image_file)
            img = Image.open(image_path).convert("RGB")
            img = img.resize((width, height))
            ImageDraw.Draw(img).text((0, 0), caption, fill=(255, 0, 255), font=FOOT)
            tiles[caption] = img
        result_img = Image.new("RGB", (width * 3, height * 2))
        for position, caption in enumerate(captions):
            row, col = divmod(position, 3)
            result_img.paste(tiles[caption], (col * width, row * height))
        result_img.save(os.path.join(save_path, image_id + ".jpg"))
def get_images(ann_file):
    """Collect the image paths of every key frame listed in an annotation file.

    Args:
        ann_file: Path to a JSON file shaped like
            ``{scene_id: {"key_frames": {frame_id: {"image_paths": ...}}}}``.

    Returns:
        dict: ``"<scene_id>_<frame_id>"`` mapped to that frame's
        ``image_paths`` entry. An id seen a second time is printed and the
        first value is kept.
    """
    with open(ann_file, "r") as handle:
        annotations = json.load(handle)
    collected = {}
    for scene_id, scene in annotations.items():
        for frame_id, frame in scene["key_frames"].items():
            image_id = scene_id + "_" + frame_id
            if image_id in collected:
                print(image_id)
                continue
            collected[image_id] = frame["image_paths"]
    return collected
if __name__ == "__main__":
    # Command line: the image root directory and the annotation file that
    # lists the key frames to stitch.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data-root", type=str, default="InternVL-Domain-Adaptation-Data/images/drivelm"
    )
    cli.add_argument("--ann-file", type=str, default="path/to/v1_1_val_nus_q_only.json")
    # ``args`` must remain a module-level name: custom_image reads
    # ``args.data_root`` from it.
    args = cli.parse_args()
    images = get_images(args.ann_file)
    # Stitched images go into a "stitch" folder next to the source images.
    save_path = os.path.join(args.data_root, "stitch")
    os.makedirs(save_path, exist_ok=True)
    custom_image(images, save_path)
import argparse
import json
import os
from collections import OrderedDict
from copy import deepcopy
import torch
from safetensors import safe_open
from transformers import (
AutoConfig,
AutoModel,
AutoModelForImageTextToText,
AutoTokenizer,
)
def compute_l2_distance(model1, model2):
    """Print and return how far apart the weights of two models are.

    For every state-dict key present in both models, the tensors are cast to
    float on CPU. If their shapes match, the L2 norm of their difference is
    added to a running total; mismatched shapes are reported and skipped.

    Returns:
        float: Sum of the per-tensor L2 norms over all compared keys.
    """
    weights_a = model1.state_dict()
    weights_b = model2.state_dict()
    distance_sum = 0.0
    element_count = 0
    for name in set(weights_a.keys()) & set(weights_b.keys()):
        lhs = weights_a[name].float().cpu()
        rhs = weights_b[name].float().cpu()
        if lhs.shape == rhs.shape:
            delta = lhs - rhs
            distance_sum += torch.norm(delta, p=2).item()
            element_count += delta.numel()
        else:
            print(f"⚠️ Shape mismatch at key: {name}, skipping.")
    print(f"\n✅ Total L2 distance: {distance_sum:.6f}")
    if element_count > 0:
        print(f"✅ Average per-parameter L2: {distance_sum / element_count:.8f}")
    else:
        print("⚠️ No matching parameters.")
    return distance_sum
def convert_keys_to_hf(custom_state_dict):
    """Rename the keys of a custom-format state dict to the HF naming scheme.

    Projector, vision-embedding, vision-encoder-layer and language-model keys
    are renamed and placed under a ``model.`` prefix; the LM head becomes
    ``lm_head.weight``. Fused ``attn.qkv`` tensors are split into three equal
    chunks along dim 0 and stored as ``q_proj`` / ``k_proj`` / ``v_proj``
    after all other keys.

    Args:
        custom_state_dict: Mapping of parameter name to tensor.

    Returns:
        OrderedDict: The renamed mapping.
    """
    projector_prefixes = (
        ("mlp1.0.", "multi_modal_projector.layer_norm."),
        ("mlp1.1.", "multi_modal_projector.linear_1."),
        ("mlp1.3.", "multi_modal_projector.linear_2."),
    )
    exact_renames = {
        "vision_model.embeddings.class_embedding": "model.vision_tower.embeddings.cls_token",
        "vision_model.embeddings.position_embedding": "model.vision_tower.embeddings.position_embeddings",
        "language_model.lm_head.weight": "lm_head.weight",
        "language_model.model.lm_head.weight": "lm_head.weight",
    }
    layer_prefixes = (
        ("attn.proj.", "attention.projection_layer."),
        ("norm1.", "layernorm_before."),
        ("norm2.", "layernorm_after."),
    )
    layer_scale_names = {"ls1": "lambda_1", "ls2": "lambda_2"}
    patch_old = "vision_model.embeddings.patch_embedding"
    patch_new = "vision_tower.embeddings.patch_embeddings.projection"

    def rename_layer_suffix(suffix):
        # Translate the part of an encoder-layer key that follows the layer id.
        if suffix in layer_scale_names:
            return layer_scale_names[suffix]
        for old, new in layer_prefixes:
            if suffix.startswith(old):
                return new + suffix.split(".")[-1]
        return suffix

    converted = OrderedDict()
    fused_qkv = {}
    for key, value in custom_state_dict.items():
        projector = next(
            (pair for pair in projector_prefixes if key.startswith(pair[0])), None
        )
        if key in exact_renames:
            new_key = exact_renames[key]
        elif projector is not None:
            new_key = "model." + key.replace(*projector)
        elif key.startswith(patch_old):
            new_key = "model." + key.replace(patch_old, patch_new)
        elif key.startswith("vision_model.encoder.layers."):
            pieces = key.split(".")
            layer_id = pieces[3]
            suffix = ".".join(pieces[4:])
            # Fused QKV tensors are set aside and split once the loop is done.
            if suffix.startswith("attn.qkv.weight"):
                fused_qkv[(layer_id, "weight")] = value
                continue
            if suffix.startswith("attn.qkv.bias"):
                fused_qkv[(layer_id, "bias")] = value
                continue
            new_key = (
                f"model.vision_tower.encoder.layer.{layer_id}."
                + rename_layer_suffix(suffix)
            )
        elif key.startswith("language_model.model."):
            new_key = "model." + key.replace("language_model.model.", "language_model.")
        elif key.startswith("model."):
            new_key = key
        else:
            new_key = "model." + key
        converted[new_key] = value

    for (layer_id, kind), fused in fused_qkv.items():
        third = fused.shape[0] // 3
        chunks = (fused[:third], fused[third : 2 * third], fused[2 * third :])
        prefix = f"model.vision_tower.encoder.layer.{layer_id}.attention."
        for proj, chunk in zip(("q_proj", "k_proj", "v_proj"), chunks):
            converted[f"{prefix}{proj}.{kind}"] = chunk
    return converted
if __name__ == "__main__":
    # Convert a custom-format safetensors checkpoint to the HF key layout,
    # compare the result against the reference HF model, and save it together
    # with the reference tokenizer.
    parser = argparse.ArgumentParser(
        description="Convert custom safetensors weights and compare with HuggingFace model."
    )
    parser.add_argument(
        "--custom_path",
        type=str,
        required=True,
        help="Path to original safetensors checkpoint folder",
    )
    parser.add_argument(
        "--hf_path",
        type=str,
        required=True,
        help="Path to pretrained HuggingFace model",
    )
    parser.add_argument(
        "--save_path", type=str, required=True, help="Path to save the converted model"
    )
    args = parser.parse_args()
    mllm_custom_path = args.custom_path
    mllm_hf_path = args.hf_path
    mllm_save_path = args.save_path
    # Build an uninitialised HF-style model from the config under --hf_path.
    config = AutoConfig.from_pretrained(mllm_hf_path, trust_remote_code=True)
    # Use the GPU when one is present but do not require it: the comparison
    # below moves every tensor to the CPU anyway.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForImageTextToText.from_config(config, trust_remote_code=True).to(
        device
    )
    # Gather the tensors of every *.safetensors file under --custom_path.
    checkpoint_paths = [
        os.path.join(mllm_custom_path, f)
        for f in os.listdir(mllm_custom_path)
        if f.endswith(".safetensors")
    ]
    print(f"\n🔍 Found checkpoint files: {checkpoint_paths}")
    custom_state_dict = {}
    for checkpoint_path in checkpoint_paths:
        with safe_open(checkpoint_path, framework="pt") as f:
            for k in f.keys():
                custom_state_dict[k] = f.get_tensor(k)
    # Convert key naming style
    model_state_dict = convert_keys_to_hf(custom_state_dict)
    # Load weights into model; non-strict so that naming gaps are reported
    # below instead of aborting the conversion.
    missing_keys, unexpected_keys = model.load_state_dict(
        model_state_dict, strict=False
    )
    print(f"\n❌ Missing keys: {missing_keys}")
    print(f"⚠️ Unexpected keys: {unexpected_keys}")
    # Load the reference HF model for comparison
    model_compare = AutoModelForImageTextToText.from_pretrained(
        mllm_hf_path, trust_remote_code=True
    )
    compute_l2_distance(model, model_compare)
    # Save the converted model
    model.save_pretrained(mllm_save_path)
    tokenizer = AutoTokenizer.from_pretrained(mllm_hf_path, trust_remote_code=True)
    tokenizer.save_pretrained(mllm_save_path)
import argparse
import json
import os
from copy import deepcopy
import torch
from safetensors import safe_open
from transformers import AutoConfig, AutoModel, AutoTokenizer
def compute_l2_distance(model1, model2):
    """Print and return the summed L2 distance between two models' weights.

    Only state-dict keys that both models share are compared. Each pair is
    cast to float on CPU; pairs with differing shapes are reported and
    skipped, and for the rest the L2 norm of the difference is accumulated.

    Returns:
        float: Sum of the per-tensor L2 norms over all compared keys.
    """
    first_state = model1.state_dict()
    second_state = model2.state_dict()
    shared_names = set(first_state.keys()) & set(second_state.keys())
    running_total = 0.0
    compared_elements = 0
    for name in shared_names:
        left = first_state[name].float().cpu()
        right = second_state[name].float().cpu()
        if left.shape != right.shape:
            print(f"⚠️ Shape mismatch at key: {name}, skipping.")
            continue
        gap = left - right
        running_total += torch.norm(gap, p=2).item()
        compared_elements += gap.numel()
    print(f"\n✅ Total L2 distance: {running_total:.6f}")
    if compared_elements > 0:
        summary = f"✅ Average per-parameter L2: {running_total / compared_elements:.8f}"
    else:
        summary = "⚠️ No matching parameters."
    print(summary)
    return running_total
def convert_keys_back(hf_state_dict):
    """Rename the keys of an HF-format state dict back to the custom scheme.

    Projector, vision-embedding and vision-encoder-layer keys are renamed;
    every other key is passed through unchanged. Separate ``q_proj`` /
    ``k_proj`` / ``v_proj`` tensors of a layer are concatenated along dim 0
    into one ``attn.qkv`` tensor, but only when all three parts of that kind
    (weight or bias) are present; incomplete sets are dropped.

    Args:
        hf_state_dict: Mapping of parameter name to tensor.

    Returns:
        dict: The renamed mapping, fused QKV entries last.
    """
    projections = ("q_proj", "k_proj", "v_proj")
    split_qkv_suffixes = {
        f"attention.{proj}.{kind}" for proj in projections for kind in ("weight", "bias")
    }
    projector_prefixes = (
        ("multi_modal_projector.layer_norm.", "mlp1.0."),
        ("multi_modal_projector.linear_1.", "mlp1.1."),
        ("multi_modal_projector.linear_2.", "mlp1.3."),
    )
    exact_renames = {
        "vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
        "vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
    }
    layer_prefixes = (
        ("attention.projection_layer.", "attn.proj."),
        ("layernorm_before.", "norm1."),
        ("layernorm_after.", "norm2."),
    )
    layer_scale_names = {"lambda_1": "ls1", "lambda_2": "ls2"}

    def rename_layer_suffix(suffix):
        # Translate the part of an encoder-layer key that follows the layer id.
        if suffix in layer_scale_names:
            return layer_scale_names[suffix]
        for old, new in layer_prefixes:
            if suffix.startswith(old):
                return new + suffix.split(".")[-1]
        return suffix

    restored = {}
    # layer id -> {"weight": {proj: tensor}, "bias": {proj: tensor}}
    qkv_buffer = {}
    for key, value in hf_state_dict.items():
        projector = next(
            (pair for pair in projector_prefixes if key.startswith(pair[0])), None
        )
        if key in exact_renames:
            new_key = exact_renames[key]
        elif projector is not None:
            new_key = key.replace(*projector)
        elif key.startswith("vision_tower.embeddings.patch_embeddings.projection."):
            new_key = key.replace(
                "vision_tower.embeddings.patch_embeddings.projection",
                "vision_model.embeddings.patch_embedding",
            )
        elif key.startswith("vision_tower.encoder.layer."):
            pieces = key.split(".")
            layer_id = pieces[3]
            suffix = ".".join(pieces[4:])
            if suffix in split_qkv_suffixes:
                # Park the part; concatenation happens after the main loop.
                _, proj, kind = suffix.split(".")
                slot = qkv_buffer.setdefault(layer_id, {"weight": {}, "bias": {}})
                slot[kind][proj] = value
                continue
            new_key = f"vision_model.encoder.layers.{layer_id}." + rename_layer_suffix(
                suffix
            )
        else:
            new_key = key
        restored[new_key] = value

    for layer_id, parts in qkv_buffer.items():
        fused_name = f"vision_model.encoder.layers.{layer_id}.attn.qkv"
        for kind in ("weight", "bias"):
            group = parts[kind]
            if all(proj in group for proj in projections):
                restored[f"{fused_name}.{kind}"] = torch.cat(
                    [group[proj] for proj in projections], dim=0
                )
    return restored
if __name__ == "__main__":
    # Rebuild a custom-format model from HF-format safetensors, compare it
    # with the model stored under --custom_path, then save model + tokenizer.
    parser = argparse.ArgumentParser(
        description="Convert HF model weights to original custom key format and compare."
    )
    cli_options = (
        ("--custom_path", "Path to custom model config and tokenizer"),
        ("--hf_path", "Path to HF-formatted safetensor weights"),
        ("--save_path", "Path to save converted model"),
    )
    for flag, help_text in cli_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
    args = parser.parse_args()
    mllm_custom_path, mllm_hf_path, mllm_save_path = (
        args.custom_path,
        args.hf_path,
        args.save_path,
    )

    # Instantiate the custom architecture from its config.
    config = AutoConfig.from_pretrained(mllm_custom_path, trust_remote_code=True)
    model = AutoModel.from_config(config, trust_remote_code=True)

    # Read every tensor from the *.safetensors files under --hf_path.
    checkpoint_paths = []
    for file_name in os.listdir(mllm_hf_path):
        if file_name.endswith(".safetensors"):
            checkpoint_paths.append(os.path.join(mllm_hf_path, file_name))
    print(f"\n🔍 Found checkpoint files: {checkpoint_paths}")
    model_state_dict_hf = {}
    for checkpoint_path in checkpoint_paths:
        with safe_open(checkpoint_path, framework="pt") as shard:
            for tensor_name in shard.keys():
                model_state_dict_hf[tensor_name] = shard.get_tensor(tensor_name)

    # Rename the keys, then load non-strictly so mismatches are only reported.
    model_state_dict = convert_keys_back(model_state_dict_hf)
    missing_keys, unexpected_keys = model.load_state_dict(
        model_state_dict, strict=False
    )
    print(f"\n❌ Missing keys: {missing_keys}")
    print(f"⚠️ Unexpected keys: {unexpected_keys}")

    # Compare against the model stored under --custom_path.
    model_compare = AutoModel.from_pretrained(mllm_custom_path, trust_remote_code=True)
    compute_l2_distance(model, model_compare)

    # Persist the converted weights and the matching tokenizer.
    model.save_pretrained(mllm_save_path)
    tokenizer = AutoTokenizer.from_pretrained(mllm_custom_path, trust_remote_code=True)
    tokenizer.save_pretrained(mllm_save_path)
import argparse
import json
# Convert a JSON array of conversation records into JSON Lines: drop a leading
# "system" turn from each record, set ``id`` to the record's position, and
# write the result next to the input with a ``.jsonl`` extension.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
parser = argparse.ArgumentParser()
parser.add_argument("path", type=str, help="Input .json file holding a list of records")
args = parser.parse_args()
if not args.path.endswith(".json"):
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    parser.error("path must end with .json")
# Swap only the trailing extension; str.replace would also rewrite any earlier
# ".json" occurring in a directory name.
output_path = args.path[: -len(".json")] + ".jsonl"
with open(args.path, encoding="utf-8") as reader:
    data = json.load(reader)
# ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed to
# UTF-8 instead of depending on the locale.
with open(output_path, "w", encoding="utf-8") as writer:
    for idx, item in enumerate(data):
        conversations = item["conversations"]
        # Records without any turn are written through unchanged.
        if conversations and conversations[0]["from"] == "system":
            item["conversations"] = conversations[1:]
        item["id"] = idx
        writer.write(json.dumps(item, ensure_ascii=False) + "\n")
import argparse
import json
import os
# Rewrite a JSON Lines file of conversation records: set ``id`` to the record's
# position, drop a leading "system" turn, and write the result to
# ``<name>_new.jsonl`` next to the input.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
parser = argparse.ArgumentParser()
parser.add_argument("path", type=str, help="Input .jsonl file, one record per line")
args = parser.parse_args()
if not args.path.endswith(".jsonl"):
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    parser.error("path must end with .jsonl")
# Swap only the trailing extension; str.replace would also rewrite any earlier
# ".jsonl" occurring in a directory name.
output_path = args.path[: -len(".jsonl")] + "_new.jsonl"
with open(args.path, encoding="utf-8") as f:
    # Blank lines (e.g. a trailing newline) are not records and are skipped.
    data = [json.loads(line) for line in f if line.strip()]
# ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed to
# UTF-8 instead of depending on the locale.
with open(output_path, "w", encoding="utf-8") as writer:
    for idx, item in enumerate(data):
        item["id"] = idx
        conversations = item["conversations"]
        # Records without any turn are written through unchanged.
        if conversations and conversations[0]["from"] == "system":
            item["conversations"] = conversations[1:]
        writer.write(json.dumps(item, ensure_ascii=False) + "\n")
import argparse
import torch
from internvl.model.internvl_chat import InternVLChatModel
from transformers import AutoTokenizer
# Merge LoRA adapters of an InternVL chat checkpoint into its base weights and
# save the merged model together with its tokenizer.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
parser = argparse.ArgumentParser()
parser.add_argument("input_path", type=str, help="Path to the input model")
parser.add_argument("output_path", type=str, help="Path to the output model")
args = parser.parse_args()
print("Loading model...")
model = InternVLChatModel.from_pretrained(
    args.input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).eval()
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(args.input_path, trust_remote_code=True)
if model.config.use_backbone_lora:
    # Merge the adapter, swap in the wrapped ``.model``, and clear the config
    # flag so the saved config no longer announces a LoRA backbone.
    model.vision_model.merge_and_unload()
    model.vision_model = model.vision_model.model
    model.config.use_backbone_lora = 0
if model.config.use_llm_lora:
    # Same procedure for the language model.
    model.language_model.merge_and_unload()
    model.language_model = model.language_model.model
    model.config.use_llm_lora = 0
print("Saving model...")
model.save_pretrained(args.output_path)
print("Saving tokenizer...")
tokenizer.save_pretrained(args.output_path)
print("Done!")
import argparse
import torch
from internvl.model.internvl_chat import InternVLChatModel
from transformers import AutoModel, AutoTokenizer
# Replace the language model of an InternVL chat checkpoint with another LLM
# and save the result (plus the LLM's tokenizer) to ``<model_path>_replace_llm``.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
# Both arguments are required positionals, so no ``default`` is declared
# (argparse ignores it for required positionals).
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="InternVL chat checkpoint")
parser.add_argument("llm_path", type=str, help="Replacement language model")
args = parser.parse_args()
# Drop trailing slashes so the suffix is appended to the directory name rather
# than creating a child directory. rstrip also copes with an empty string,
# where indexing ``[-1]`` would raise IndexError.
args.model_path = args.model_path.rstrip("/")
model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
llm = AutoModel.from_pretrained(
    args.llm_path, trust_remote_code=True, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(args.llm_path, trust_remote_code=True)
model.language_model = llm
# Keep the saved config consistent with the swapped-in language model.
model.config.llm_config = llm.config
model.to(torch.bfloat16)
output_path = args.model_path + "_replace_llm"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print("finished")
import argparse
import torch
from internvl.model.internvl_chat import InternVLChatModel
from transformers import AutoTokenizer
# Resize the vision model's position embeddings of an InternVL chat checkpoint
# to a new image size and save the updated model and tokenizer.
#
# The parser gets its own name so the ``argparse`` module is not shadowed.
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Checkpoint to load")
parser.add_argument("output_path", type=str, help="Directory for the resized model")
# nargs="?" makes the declared default effective; a plain positional is always
# required, so 448 could never apply. Passing the value explicitly still works.
parser.add_argument(
    "force_image_size",
    type=int,
    nargs="?",
    default=448,
    help="Target image size (default: 448)",
)
args = parser.parse_args()
model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
model.vision_model.resize_pos_embeddings(
    old_size=model.config.vision_config.image_size,
    new_size=args.force_image_size,
    patch_size=14,
)
# Record the new size in both config locations that carry it.
model.config.vision_config.image_size = args.force_image_size
model.config.force_image_size = args.force_image_size
model.save_pretrained(args.output_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
tokenizer.save_pretrained(args.output_path)
print("finished")
{
"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 1e9,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e9,
"contiguous_gradients": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 1e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e8,
"contiguous_gradients": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": 1e9,
"stage3_prefetch_bucket_size": 1e9,
"stage3_param_persistence_threshold": 1e7,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": 1e9,
"stage3_prefetch_bucket_size": 1e9,
"stage3_param_persistence_threshold": 1e4,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": false,
"contiguous_gradients": true,
"sub_group_size": 1e7,
"reduce_bucket_size": 1e7,
"stage3_prefetch_bucket_size": 1e7,
"stage3_param_persistence_threshold": 1e4,
"stage3_max_live_parameters": 1e8,
"stage3_max_reuse_distance": 1e8,
"stage3_gather_16bit_weights_on_model_save": true,
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
}
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e8,
"reduce_bucket_size": 1e8,
"stage3_prefetch_bucket_size": 1e8,
"stage3_param_persistence_threshold": 1e4,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": 1e9,
"stage3_prefetch_bucket_size": 1e9,
"stage3_param_persistence_threshold": 1e5,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": 1e9,
"stage3_prefetch_bucket_size": 1e9,
"stage3_param_persistence_threshold": 1e5,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"auto_cast": true,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": true
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment