Commit 5e887c2c authored by wanglch

Initial commit
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_dev TSV file to JSONL.
'''
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
return image
with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f:
for idx in range(len(datas)):
data = datas.iloc[idx]
index = int(data['index'])
question = data['question']
hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'
choices = []
for opt in global_choices:
if pd.isna(data[opt]):
continue
choices.append(data[opt])
answer = global_choices.index(data['answer'])
image = decode_base64_to_image(data['image'])
image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index)
f.write(json.dumps({
"index": index,
"image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index,
"hint": hint,
"question": question,
"choices": choices,
"answer": answer,
}) + "\n")
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_test TSV file to JSONL.
It is very similar to mmbench_converter_dev, except that there is no answer column for accuracy calculation.
'''
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
return image
with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f:
for idx in range(len(datas)):
data = datas.iloc[idx]
index = int(data['index'])
question = data['question']
hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'
choices = []
for opt in global_choices:
if pd.isna(data[opt]):
continue
choices.append(data[opt])
# answer = global_choices.index(data['answer'])
image = decode_base64_to_image(data['image'])
image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index)
f.write(json.dumps({
"index": index,
"image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index,
"hint": hint,
"question": question,
"choices": choices,
# "answer": answer,
}) + "\n")
import pandas as pd
import json
'''
This script computes the `global top-1 accuracy` metric for mmbench_dev.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
index2answer[data['index']] = glb_opts.index(data['answer'])
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
if index2predictions[cycle_index] == index2answer[cycle_index]:
continue
else:
print(cycle_index)
break
else:
correct += 1
total += 1
print(correct, total)
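# Note on the index encoding above (derived from the loop): MMBench circular
# evaluation stores the k-th rotation of base question `index` as k * 1e6 + index,
# e.g. base index 42 appears as 42, 1000042, 2000042 and 3000042. A question only
# counts as correct when every available rotation is answered correctly.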
import pandas as pd
import json
import random
'''
This script computes the mmbench_dev metric with the same accuracy algorithm as the OpenCompass server.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
counter = Counter(lst)
max_count = max(counter.values())
most_common = [element for element, count in counter.items() if count == max_count]
    return random.choice(most_common)  # break ties by picking one of the most frequent answers at random
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
index2choices = {}
index2rawanswer = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
choices = []
for opt in glb_opts:
if not pd.isna(data[opt]):
choices.append(data[opt])
index2choices[data['index']] = choices
index2answer[data['index']] = glb_opts.index(data['answer'])
index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])]
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
raw_preds = []
raw_answer = []
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
raw_answer = index2rawanswer[cycle_index]
raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
raw_preds.append(raw_pred)
if len(set(raw_preds)) == 1:
if raw_preds[0] == raw_answer:
correct += 1
else:
result = most_common_elements(raw_preds)
if result == raw_answer:
correct += 1
total += 1
print(correct, total, correct / total * 100.)
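# Illustrative behaviour of most_common_elements (tie-breaking is random):
#   most_common_elements(['A', 'B', 'B', 'C'])  -> 'B'
#   most_common_elements(['A', 'A', 'B', 'B'])  -> 'A' or 'B', chosen at random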
import pandas as pd
import json
import random
'''
This script converts the output file of our inference processor into the target format of the OpenCompass evaluator server.
'''
predictions = json.load(open('mmbench_test_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
counter = Counter(lst)
max_count = max(counter.values())
most_common = [element for element, count in counter.items() if count == max_count]
print(most_common)
return random.choice(most_common)
# return most_common
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
datas = datas.drop('image', axis=1)
glb_opts = ['A', 'B', 'C', 'D']
index2choices = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
choices = []
for opt in glb_opts:
if not pd.isna(data[opt]):
choices.append(data[opt])
index2choices[data['index']] = choices
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
processed_index2predictions = {}
for index in identity_indexes:
raw_preds = []
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
raw_preds.append(raw_pred)
if len(set(raw_preds)) == 1:
pred_answer = raw_preds[0]
else:
pred_answer = most_common_elements(raw_preds)
print(index, pred_answer)
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer)
predictions = []
for idx in range(len(datas)):
data = datas.iloc[idx]
index = data['index']
prediction = glb_opts[processed_index2predictions[index]]
predictions.append(prediction)
datas['prediction'] = predictions
datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False)
# "constrained" means we force the model to predict the same answer when a question is tested multiple times
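# The resulting xlsx keeps the original TSV columns (minus `image`) plus a
# `prediction` column holding one letter (A-D) per row; based on the script above,
# this is the format expected by the OpenCompass submission server.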
# MME Benchmark
[MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning.
Qwen-VL-Chat achieves SOTA results on both the perception and cognition evaluations.
## Perception Evaluation
| Rank | Model | Version | Score |
|:----:|:---------------:|:------------------------:|:-------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** |
| 2 | Skywork-MM | Skywork-MM-13B | 1419.08 |
| 3 | MMICL | FlanT5xxl | 1376.00 |
| 4 | Lynx | vicuna-7b | 1373.23 |
| 5 | BLIVA | FlanT5xxl | 1337.73 |
## Cognition Evaluation
| Rank | Model | Version | Score |
|:----:|:----------------:|:--------------:|:----------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** |
| 2 | MMICL | FlanT5xxl | 360.36 |
| 3 | Skywork-MM | Skywork-MM-13B | 356.43 |
| 4 | BLIVA | FlanT5xxl | 331.43 |
| 5 | LRV-Instruction | LRV-7B | 328.21 |
## Full Metrics
```
=========== Perception ===========
total score: 1487.576330532213
existence score: 158.33333333333331
count score: 150.0
position score: 128.33333333333334
color score: 170.0
posters score: 178.57142857142856
celebrity score: 120.58823529411764
scene score: 152.25
landmark score: 164.0
artwork score: 125.5
OCR score: 140.0
=========== Cognition ===========
total score: 360.71428571428567
commonsense_reasoning score: 130.7142857142857
numerical_calculation score: 40.0
text_translation score: 147.5
code_reasoning score: 42.5
```
## How To Reproduce Results of MME Benchmark
1. Download the MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md).
2. Rearrange the images by executing `python get_images.py`.
3. Produce the Qwen-VL-Chat results by executing `python eval.py`.
4. Calculate the MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`, where `calculation.py` comes from the MME eval_tool. The collated command sequence is shown below.
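For convenience, the collated command sequence (assuming the image layout produced by `get_images.py` and the `Your_Results` question files from the MME eval_tool) is:
```
python get_images.py
python eval.py
python calculation.py --results_dir Qwen-VL-Chat
```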
import os
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
checkpoint = 'Qwen/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
checkpoint, device_map='cuda', trust_remote_code=True).eval()
model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
root = 'Your_Results'
output = 'Qwen-VL-Chat'
os.makedirs(output, exist_ok=True)
for filename in os.listdir(root):
with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout:
lines = fin.read().splitlines()
filename = filename.replace('.txt', '')
for line in tqdm(lines):
img, question, gt = line.strip().split('\t')
img_path = os.path.join('images', filename, img)
assert os.path.exists(img_path), img_path
query = f'<img>{img_path}</img>\n{question}'
response, _ = model.chat(tokenizer, query=query, history=None)
print(img, question, gt, response, sep='\t', file=fout)
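# Note: each line of the MME question files is expected to be
# "image<TAB>question<TAB>ground_truth"; the loop above appends the model response
# as a fourth tab-separated field, which is the layout consumed by calculation.py
# (an assumption based on the README steps above).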
import os
from tqdm import tqdm
os.system('rm -rf images')
os.system('mkdir images')
os.system('cp -r ../MME_Benchmark_release/OCR images/')
os.system('mkdir images/artwork')
os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/')
with open('LaVIN/artwork.txt') as fin:
paths = [ line.strip().split('\t', 1)[0] for line in fin ]
paths = list(set(paths))
for path in tqdm(paths):
os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}')
os.system('mkdir images/celebrity')
os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/')
os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/')
os.system('cp -r ../MME_Benchmark_release/code_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/color images/')
os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/count images/')
os.system('cp -r ../MME_Benchmark_release/existence images/')
os.system('mkdir images/landmark')
os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/')
os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/')
os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/')
os.system('cp -r ../MME_Benchmark_release/position images/')
os.system('mkdir images/posters')
os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/')
os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/')
os.system('mkdir images/scene')
os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/')
os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/')
os.system('cp -r ../MME_Benchmark_release/text_translation images/')
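# After this script runs, images/ contains one sub-directory per MME subtask.
# Subtasks whose release keeps images and questions_answers_YN files in separate
# folders (artwork, celebrity, landmark, posters, scene) are flattened here so that
# eval.py can find both under images/<subtask>/.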
# Seed-Bench Evaluation
[SEED-Bench](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) is a multimodal benchmark of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both **image** and **video** understanding.
Qwen-VL and Qwen-VL-Chat achieve SOTA results on this benchmark.
<p align="center">
<img src="leaderboard.jpg"/>
</p>
## How To Process Video by Qwen-VL
Qwen-VL and Qwen-VL-Chat were not trained on any video data or video tasks, but they can understand some videos in a zero-shot manner. For the video question-answering task, we uniformly sample four frames per video. These frames are treated as separate images and stitched into the context. For example:
```
{
"question_id": "v0",
"prompt": "<img>video_imgs_4/v0_0.jpg</img>\n<img>video_imgs_4/v0_1.jpg</img>\n<img>video_imgs_4/v0_2.jpg</img>\n<img>video_imgs_4/v0_3.jpg</img>\nQuestion: Can you identify the action taking place in the video?\nOptions: A. pretending to take something out of something\nB. pretending to take something from somewhere\nC. feigning to insert something into something\nD. simulating putting something onto something\nAnswer:"
}
```
The above JSON line can be used as input to `eval_mm/seed_bench/eval.py`, which outputs the following result:
```
{"question_id": "v0", "prediction": "B"}
```
Please see [eval_mm/seed_bench/eval.py](eval.py) for more inference details.
## How To Reproduce Results of Seed-Bench
1. Download all images and videos by following the [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md). Then modify the root paths in `eval_mm/seed_bench/trans.py` to your customized paths.
```
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
```
2. Generate the Qwen-VL input files in JSONL format.
```
cd eval_mm/seed_bench/
python trans.py
```
This script outputs two JSONL files and one directory. `image_input.jsonl` is the input file for image evaluation, and `video_input_4.jsonl` is the input file for video evaluation with 4 frames. The directory `video_imgs_4` contains all frames extracted from the videos (4 per video). We provide our [image_input.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/image_input.jsonl) and [video_input_4.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/video_input_4.jsonl) for reference; a schematic record is shown below.
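For reference, each line of `image_input.jsonl` follows the prompt template used in `trans.py`; a schematic (hypothetical) record looks like:
```
{"question_id": "1", "prompt": "<img>/YOUR_PATH_TO/seed_bench_image/example.jpg</img>\nQuestion: ...\nOptions: A. ...\nB. ...\nC. ...\nD. ...\nAnswer:", "answer": "A"}
```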
3. Produce the results of Seed-Bench.
```
# The number of available GPUs
export NPROC_PER_NODE=8
# Produce the Qwen-VL-Chat results of image understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset image_input.jsonl \
--batch-size 4 \
--num-workers 2
# Collect the result files
cat result_?.jsonl >results_chat_img.jsonl
rm result_?.jsonl
# Produce the results of video understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset video_input_4.jsonl \
--batch-size 2 \
--num-workers 1
# Collect the result files
cat result_?.jsonl >results_chat_vid.jsonl
rm result_?.jsonl
# The file `results_chat.jsonl` can be submitted to the leaderboard
cat results_chat_img.jsonl results_chat_vid.jsonl >results_chat.jsonl
```
You can reproduce the Seed-Bench results of Qwen-VL by replacing `Qwen/Qwen-VL-Chat` with `Qwen/Qwen-VL` in the above script.
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def collate_fn(batches, pad_token_id):
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
answers = [_['answer'] for _ in batches]
question_id = [_['question_id'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, answers, chunk_sizes, question_id
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, tokenizer):
self.datas = []
with open(test) as fin:
for line in tqdm(fin):
self.datas.append(json.loads(line.strip()))
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = self.datas[idx]
prompt = data['prompt']
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in ['A', 'B', 'C', 'D']
]
return {
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
'answer': data['answer'],
'question_id': data['question_id'],
}
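# Note: __getitem__ expands each prompt into four candidate sequences, one per option
# letter (' A' .. ' D'); collate_fn above flattens them, and `chunk_sizes` records how
# many sequences belong to each question so their losses can be regrouped per sample.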
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(args.checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
    dataset = MultipleChoiceDataset(test=args.dataset, tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
# sampler=InferenceSampler(1000),
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
fout = open('result_{}.jsonl'.format(torch.distributed.get_rank()), 'w')
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths, answers,
chunk_sizes, question_ids) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, answer, question_id in zip(losses, target_lengths,
answers, question_ids):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
pred = chr(pred + 65)
if pred == answer:
results.append(1)
else:
results.append(0)
answer_record = {
'question_id': question_id,
'prediction': pred
}
print(json.dumps(answer_record), file=fout)
fout.close()
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
torch.distributed.barrier()
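# Scoring rule used above: each of the four candidate continuations is ranked by the
# mean cross-entropy over its option tokens, the lowest-loss option is taken as the
# prediction, and its index is mapped to a letter via chr(pred + 65).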
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
def is_integer_string(s):
try:
int(s)
return True
except ValueError:
return False
def filter_questions(data, task='all'):
if task == "image":
return [q for q in data if 1 <= q["question_type_id"] <= 9]
elif task == "video":
return [q for q in data if 10 <= q["question_type_id"] <= 12]
elif task == "all":
return data
elif is_integer_string(task):
return [q for q in data if q["question_type_id"] == int(task)]
else:
raise ValueError(f"Invalid task: {task}")
def get_index(num_frames, num_segments):
if num_segments > num_frames:
offsets = np.array([
idx for idx in range(num_frames)
])
else:
# uniform sampling
seg_size = float(num_frames - 1) / num_segments
start = int(seg_size / 2)
offsets = np.array([
start + int(np.round(seg_size * idx)) for idx in range(num_segments)
])
return offsets
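# Example: get_index(9, 4) returns array([1, 3, 5, 7]), i.e. 4 frame indices sampled
# uniformly from a 9-frame clip.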
with open(seed_bench_input_path) as fin:
qa_anno = json.load(fin)['questions']
fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
data_path = cc3m_dir + qa_item['data_id']
choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(chr(i + 65), c))
choice_txt = '\n'.join(choice_list)
prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
data_path, qa_item['question'], choice_txt)
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': prompt,
'answer': qa_item['answer'],
}), file=fout)
fout.close()
n_frames = 8
os.system('rm -rf video_input_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)
fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
if qa_item['question_type_id'] == 12:
data_path = dimension12_dir + qa_item['data_id']
elif qa_item['question_type_id'] == 11:
data_path = dimension11_dir + qa_item['data_id'].split('/')[-1]
elif qa_item['question_type_id'] == 10:
data_path = dimension10_dir + qa_item['data_id']
else:
assert False, str(qa_item)
print(data_path)
use_pyav = False
if 'segment' in qa_item.keys():
segment = qa_item['segment']
if isinstance(segment[0], int):
# using pyav for decoding videos in evaluation dimension 12
use_pyav = True
start, end = segment[0], segment[1]
else:
start = 0.0
end = 0.0
if use_pyav:
# using pyav for decoding videos in evaluation dimension 12
reader = av.open(data_path)
frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
video_len = len(frames)
start_frame, end_frame = start, end
end_frame = min(end_frame, video_len)
offset = get_index(end_frame - start_frame, n_frames)
frame_indices = offset + start_frame
images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
else:
# using decord for decoding videos in evaluation dimension 10-11
try:
vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
video_len = len(vr)
fps = vr.get_avg_fps()
if 'segment' in qa_item.keys():
# obtain start and end frame for the video segment in evaluation dimension 11
start_frame = int(min(max(start * fps, 0), video_len - 1))
end_frame = int(min(max(end * fps, 0), video_len - 1))
tot_frames = int(end_frame - start_frame)
offset = get_index(tot_frames, n_frames)
frame_indices = offset + start_frame
else:
# sample frames of the video in evaluation dimension 10
frame_indices = get_index(video_len - 1, n_frames)
vr.seek(0)
images = vr.get_batch(frame_indices).asnumpy()
except Exception as e:
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': "Error" + str(e),
'answer': qa_item['answer'],
}), file=fout)
continue
prompt = ''
for i in range(images.shape[0]):
data = Image.fromarray(images[i])
img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
data.save(img_path)
prompt += '<img>' + img_path + '</img>\n'
choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(chr(i + 65), c))
choice_txt = '\n'.join(choice_list)
prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': prompt,
'answer': qa_item['answer'],
}), file=fout)
fout.close()
"""Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = 'aagrawal'
__version__ = '0.9'
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import copy
import datetime
import json
class VQA:
def __init__(self, annotation_file=None, question_file=None):
"""Constructor of VQA helper class for reading and visualizing
questions and answers.
:param annotation_file (str): location of VQA annotation file
:return:
"""
# load dataset
self.dataset = {}
self.questions = {}
self.qa = {}
self.qqa = {}
self.imgToQA = {}
if not annotation_file == None and not question_file == None:
print('loading VQA annotations and questions into memory...')
time_t = datetime.datetime.utcnow()
dataset = json.load(open(annotation_file, 'r'))
questions = json.load(open(question_file, 'r'))
self.dataset = dataset
self.questions = questions
self.createIndex()
def createIndex(self):
# create index
print('creating index...')
imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
for ann in self.dataset['annotations']:
imgToQA[ann['image_id']] += [ann]
qa[ann['question_id']] = ann
for ques in self.questions['questions']:
qqa[ques['question_id']] = ques
print('index created!')
# create class members
self.qa = qa
self.qqa = qqa
self.imgToQA = imgToQA
def info(self):
"""Print information about the VQA annotation file.
:return:
"""
        for key, value in self.dataset['info'].items():
print('%s: %s' % (key, value))
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
"""Get question ids that satisfy given filter conditions. default skips
that filter.
:param imgIds (int array) : get question ids for given imgs
quesTypes (str array) : get question ids for given question types
ansTypes (str array) : get question ids for given answer types
:return: ids (int array) : integer array of question ids
"""
imgIds = imgIds if type(imgIds) == list else [imgIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
anns = sum(
[
self.imgToQA[imgId]
for imgId in imgIds if imgId in self.imgToQA
],
[],
)
else:
anns = self.dataset['annotations']
anns = (anns if len(quesTypes) == 0 else
[ann for ann in anns if ann['question_type'] in quesTypes])
anns = (anns if len(ansTypes) == 0 else
[ann for ann in anns if ann['answer_type'] in ansTypes])
ids = [ann['question_id'] for ann in anns]
return ids
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
"""Get image ids that satisfy given filter conditions. default skips
that filter.
:param quesIds (int array) : get image ids for given question ids
quesTypes (str array) : get image ids for given question types
ansTypes (str array) : get image ids for given answer types
:return: ids (int array) : integer array of image ids
"""
quesIds = quesIds if type(quesIds) == list else [quesIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(quesIds) == 0:
anns = sum([
self.qa[quesId] for quesId in quesIds if quesId in self.qa
], [])
else:
anns = self.dataset['annotations']
anns = (anns if len(quesTypes) == 0 else
[ann for ann in anns if ann['question_type'] in quesTypes])
anns = (anns if len(ansTypes) == 0 else
[ann for ann in anns if ann['answer_type'] in ansTypes])
ids = [ann['image_id'] for ann in anns]
return ids
def loadQA(self, ids=[]):
"""Load questions and answers with the specified question ids.
:param ids (int array) : integer ids specifying question ids
:return: qa (object array) : loaded qa objects
"""
if type(ids) == list:
return [self.qa[id] for id in ids]
elif type(ids) == int:
return [self.qa[ids]]
def showQA(self, anns):
"""Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
for ann in anns:
quesId = ann['question_id']
print('Question: %s' % (self.qqa[quesId]['question']))
for ans in ann['answers']:
print('Answer %d: %s' % (ans['answer_id'], ans['answer']))
def loadRes(self, resFile, quesFile):
"""Load result file and return a result object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = VQA()
res.questions = json.load(open(quesFile))
res.dataset['info'] = copy.deepcopy(self.questions['info'])
res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
res.dataset['data_subtype'] = copy.deepcopy(
self.questions['data_subtype'])
res.dataset['license'] = copy.deepcopy(self.questions['license'])
print('Loading and preparing results... ')
time_t = datetime.datetime.utcnow()
anns = json.load(open(resFile))
assert type(anns) == list, 'results is not an array of objects'
annsQuesIds = [ann['question_id'] for ann in anns]
assert set(annsQuesIds) == set(
self.getQuesIds()
        ), 'Results do not correspond to the current VQA set. Either the results do not have predictions for all question ids in the annotation file, or there is at least one question id that does not belong to the question ids in the annotation file.'
for ann in anns:
quesId = ann['question_id']
if res.dataset['task_type'] == 'Multiple Choice':
assert (
ann['answer'] in self.qqa[quesId]['multiple_choices']
), 'predicted answer is not one of the multiple choices'
qaAnn = self.qa[quesId]
ann['image_id'] = qaAnn['image_id']
ann['question_type'] = qaAnn['question_type']
ann['answer_type'] = qaAnn['answer_type']
print('DONE (t=%0.2fs)' %
((datetime.datetime.utcnow() - time_t).total_seconds()))
res.dataset['annotations'] = anns
res.createIndex()
return res
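# Typical usage (file names are placeholders for the standard VQA v2 release):
#   vqa = VQA(annotation_file='v2_mscoco_val2014_annotations.json',
#             question_file='v2_OpenEnded_mscoco_val2014_questions.json')
#   vqaRes = vqa.loadRes('results.json', 'v2_OpenEnded_mscoco_val2014_questions.json')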
"""Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
# coding=utf-8
__author__ = 'aagrawal'
import re
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
class VQAEval:
def __init__(self, vqa=None, vqaRes=None, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa
self.vqaRes = vqaRes
if vqa is not None:
self.params = {'question_id': vqa.getQuesIds()}
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
self.commaStrip = re.compile('(\d)(,)(\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, quesIds=None):
if quesIds == None:
quesIds = [quesId for quesId in self.params['question_id']]
gts = {}
res = {}
for quesId in quesIds:
gts[quesId] = self.vqa.qa[quesId]
res[quesId] = self.vqaRes.qa[quesId]
# =================================================
# Compute accuracy
# =================================================
accQA = []
accQuesType = {}
accAnsType = {}
print('computing accuracy')
step = 0
for quesId in quesIds:
resAns = res[quesId]['answer']
resAns = resAns.replace('\n', ' ')
resAns = resAns.replace('\t', ' ')
resAns = resAns.strip()
resAns = self.processPunctuation(resAns)
resAns = self.processDigitArticle(resAns)
gtAcc = []
gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]
if len(set(gtAnswers)) > 1:
for ansDic in gts[quesId]['answers']:
ansDic['answer'] = self.processPunctuation(
ansDic['answer'])
for gtAnsDatum in gts[quesId]['answers']:
otherGTAns = [
item for item in gts[quesId]['answers']
if item != gtAnsDatum
]
matchingAns = [
item for item in otherGTAns if item['answer'] == resAns
]
acc = min(1, float(len(matchingAns)) / 3)
gtAcc.append(acc)
quesType = gts[quesId]['question_type']
ansType = gts[quesId]['answer_type']
avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
accQA.append(avgGTAcc)
if quesType not in accQuesType:
accQuesType[quesType] = []
accQuesType[quesType].append(avgGTAcc)
if ansType not in accAnsType:
accAnsType[ansType] = []
accAnsType[ansType].append(avgGTAcc)
self.setEvalQA(quesId, avgGTAcc)
self.setEvalQuesType(quesId, quesType, avgGTAcc)
self.setEvalAnsType(quesId, ansType, avgGTAcc)
if step % 100 == 0:
self.updateProgress(step / float(len(quesIds)))
step = step + 1
self.setAccuracy(accQA, accQuesType, accAnsType)
print('Done computing accuracy')
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p
in inText) or (re.search(self.commaStrip, inText) != None):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = ' '.join(outText)
return outText
def setAccuracy(self, accQA, accQuesType, accAnsType):
self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA),
self.n)
self.accuracy['perQuestionType'] = {
quesType: round(
100 * float(sum(accQuesType[quesType])) /
len(accQuesType[quesType]),
self.n,
)
for quesType in accQuesType
}
self.accuracy['perAnswerType'] = {
ansType: round(
100 * float(sum(accAnsType[ansType])) /
len(accAnsType[ansType]), self.n)
for ansType in accAnsType
}
def setEvalQA(self, quesId, acc):
self.evalQA[quesId] = round(100 * acc, self.n)
def setEvalQuesType(self, quesId, quesType, acc):
if quesType not in self.evalQuesType:
self.evalQuesType[quesType] = {}
self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
def setEvalAnsType(self, quesId, ansType, acc):
if ansType not in self.evalAnsType:
self.evalAnsType[ansType] = {}
self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
def updateProgress(self, progress):
barLength = 20
status = ''
if isinstance(progress, int):
progress = float(progress)
if not isinstance(progress, float):
progress = 0
status = 'error: progress var must be float\r\n'
if progress < 0:
progress = 0
status = 'Halt...\r\n'
if progress >= 1:
progress = 1
status = 'Done...\r\n'
block = int(round(barLength * progress))
        text = '\rFinished Percent: [{0}] {1}% {2}'.format(
'#' * block + '-' * (barLength - block), int(progress * 100),
status)
sys.stdout.write(text)
sys.stdout.flush()
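# Typical usage together with the VQA class above:
#   vqaEval = VQAEval(vqa, vqaRes, n=2)  # n is the number of decimal places for accuracies
#   vqaEval.evaluate()
#   print(vqaEval.accuracy['overall'])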
# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.
from dataclasses import dataclass, field
import json
import math
import logging
import os
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import Trainer, GPTQConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate.utils import DistributedType
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")
@dataclass
class DataArguments:
data_path: str = field(
default=None, metadata={"help": "Path to the training data."}
)
eval_data_path: str = field(
default=None, metadata={"help": "Path to the evaluation data."}
)
lazy_preprocess: bool = False
@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=8192,
metadata={
"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
use_lora: bool = False
fix_vit: bool = True
@dataclass
class LoraArguments:
lora_r: int = 64
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules: List[str] = field(
default_factory=lambda: ["c_attn", "attn.c_proj", "w1", "w2"] ##["in_proj","out_proj","c_fc"]
)
lora_weight_path: str = ""
lora_bias: str = "none"
q_lora: bool = False
def maybe_zero_3(param):
if hasattr(param, "ds_id"):
assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
with zero.GatheredParameters([param]):
param = param.data.detach().cpu().clone()
else:
param = param.detach().cpu().clone()
return param
# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
if bias == "none":
to_return = {k: t for k, t in named_params if "lora_" in k}
elif bias == "all":
to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
elif bias == "lora_only":
to_return = {}
maybe_lora_bias = {}
lora_bias_names = set()
for k, t in named_params:
if "lora_" in k:
to_return[k] = t
bias_name = k.split("lora_")[0] + "bias"
lora_bias_names.add(bias_name)
elif "bias" in k:
maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias.items():
if bias_name in lora_bias_names:
to_return[bias_name] = t
else:
raise NotImplementedError
to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
return to_return
local_rank = None
def rank0_print(*args):
if local_rank == 0:
print(*args)
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
"""Collects the state dict and dump to disk."""
# check if zero3 mode enabled
if deepspeed.is_deepspeed_zero3_enabled():
state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
else:
if trainer.args.use_lora:
state_dict = get_peft_state_maybe_zero_3(
trainer.model.named_parameters(), bias
)
else:
state_dict = trainer.model.state_dict()
if trainer.args.should_save and trainer.args.local_rank == 0:
trainer._save(output_dir, state_dict=state_dict)
def preprocess(
sources,
tokenizer: transformers.PreTrainedTokenizer,
max_len: int,
system_message: str = "You are a helpful assistant."
) -> Dict:
roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
im_start = tokenizer.im_start_id
im_end = tokenizer.im_end_id
nl_tokens = tokenizer('\n').input_ids
_system = tokenizer('system').input_ids + nl_tokens
_user = tokenizer('user').input_ids + nl_tokens
_assistant = tokenizer('assistant').input_ids + nl_tokens
# Apply prompt templates
input_ids, targets = [], []
for i, source in enumerate(sources):
if roles[source[0]["from"]] != roles["user"]:
source = source[1:]
input_id, target = [], []
system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
input_id += system
target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
assert len(input_id) == len(target)
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
_input_id = tokenizer(role).input_ids + nl_tokens + \
tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
input_id += _input_id
if role == '<|im_start|>user':
_target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
elif role == '<|im_start|>assistant':
_target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
_input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
else:
raise NotImplementedError
target += _target
assert len(input_id) == len(target)
input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
target += [IGNORE_TOKEN_ID] * (max_len - len(target))
input_ids.append(input_id[:max_len])
targets.append(target[:max_len])
input_ids = torch.tensor(input_ids, dtype=torch.int)
targets = torch.tensor(targets, dtype=torch.int)
return dict(
input_ids=input_ids,
labels=targets,
attention_mask=input_ids.ne(tokenizer.pad_token_id),
)
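# Note: preprocess() packs each conversation into ChatML-style turns
# (<|im_start|>role\n ... <|im_end|>\n), pads/truncates every sequence to max_len, and
# masks everything except the assistant replies with IGNORE_TOKEN_ID, so the loss is
# computed only on the assistant responses.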
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
super(SupervisedDataset, self).__init__()
rank0_print("Formatting inputs...")
sources = [example["conversations"] for example in raw_data]
data_dict = preprocess(sources, tokenizer, max_len)
self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]
self.attention_mask = data_dict["attention_mask"]
def __len__(self):
return len(self.input_ids)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(
input_ids=self.input_ids[i],
labels=self.labels[i],
attention_mask=self.attention_mask[i],
)
class LazySupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
super(LazySupervisedDataset, self).__init__()
self.tokenizer = tokenizer
self.max_len = max_len
rank0_print("Formatting inputs...Skip in lazy mode")
self.tokenizer = tokenizer
self.raw_data = raw_data
self.cached_data_dict = {}
def __len__(self):
return len(self.raw_data)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
if i in self.cached_data_dict:
return self.cached_data_dict[i]
ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
ret = dict(
input_ids=ret["input_ids"][0],
labels=ret["labels"][0],
attention_mask=ret["attention_mask"][0],
)
self.cached_data_dict[i] = ret
return ret
def make_supervised_data_module(
tokenizer: transformers.PreTrainedTokenizer, data_args, max_len,
) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
dataset_cls = (
LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
)
rank0_print("Loading data...")
train_json = json.load(open(data_args.data_path, "r"))
train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)
if data_args.eval_data_path:
eval_json = json.load(open(data_args.eval_data_path, "r"))
eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
else:
eval_dataset = None
return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)
def train():
global local_rank
parser = transformers.HfArgumentParser(
(ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)
(
model_args,
data_args,
training_args,
lora_args,
) = parser.parse_args_into_dataclasses()
if getattr(training_args, 'deepspeed', None) and getattr(lora_args, 'q_lora', False):
training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
compute_dtype = (
torch.float16
if training_args.fp16
else (torch.bfloat16 if training_args.bf16 else torch.float32)
)
local_rank = training_args.local_rank
device_map = None
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if lora_args.q_lora:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None
if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            logging.warning(
                "FSDP or ZeRO3 is incompatible with QLoRA."
            )
# Set RoPE scaling factor
config = transformers.AutoConfig.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
trust_remote_code=True,
)
config.use_cache = False
# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=training_args.cache_dir,
device_map=device_map,
trust_remote_code=True,
quantization_config=GPTQConfig(
bits=4, disable_exllama=True
)
if training_args.use_lora and lora_args.q_lora
else None,
)
if not training_args.use_lora:
if training_args.fix_vit and hasattr(model,'transformer') and hasattr(model.transformer,'visual'):
model.transformer.visual.requires_grad_(False)
if hasattr(model.transformer.visual,'attn_pool'):
model.transformer.visual.attn_pool.requires_grad_(True)
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length,
padding_side="right",
use_fast=False,
trust_remote_code=True,
)
tokenizer.pad_token_id = tokenizer.eod_id
if training_args.use_lora:
if lora_args.q_lora or "chat" in model_args.model_name_or_path.lower():
modules_to_save = None
else:
modules_to_save = ["wte", "lm_head"]
lora_config = LoraConfig(
r=lora_args.lora_r,
lora_alpha=lora_args.lora_alpha,
target_modules=lora_args.lora_target_modules,
lora_dropout=lora_args.lora_dropout,
bias=lora_args.lora_bias,
task_type="CAUSAL_LM",
modules_to_save=modules_to_save # This argument serves for adding new tokens.
)
if lora_args.q_lora:
model = prepare_model_for_kbit_training(
model, use_gradient_checkpointing=training_args.gradient_checkpointing
)
model = get_peft_model(model, lora_config)
if training_args.gradient_checkpointing:
model.enable_input_require_grads()
# Load data
data_module = make_supervised_data_module(
tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
)
    # Start the trainer
trainer = Trainer(
model=model, tokenizer=tokenizer, args=training_args, **data_module
)
trainer.train()
trainer.save_state()
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias)
if __name__ == "__main__":
train()
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
MODEL="Qwen/Qwen-VL-Chat" # "Qwen/Qwen-VL-Chat" or "Qwen/Qwen-VL"; set a local path if you do not want to load from Hugging Face directly
# ATTENTION: specify the path to your training data, which should be a JSON file consisting of a list of conversations.
# See the finetuning section in the README for more information.
DATA="path_to_data"
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--bf16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True \
--deepspeed finetune/ds_config_zero3.json
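# Example training data layout (hypothetical values; the structure follows how
# preprocess() in finetune.py reads each example's "conversations" list):
# [
#   {
#     "conversations": [
#       {"from": "user", "value": "<img>path/to/image.jpg</img>\nWhat is shown in this picture?"},
#       {"from": "assistant", "value": "A short description of the picture."}
#     ]
#   }
# ]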