Commit 07dbc76b authored by dongchy920

MiniGemini_pytorch

"""Generate json file for webpage."""
import json
import os
import re
# models = ['llama', 'alpaca', 'gpt35', 'bard']
models = ['vicuna']
def read_jsonl(path: str, key: str=None):
data = []
with open(os.path.expanduser(path)) as f:
for line in f:
if not line:
continue
data.append(json.loads(line))
if key is not None:
data.sort(key=lambda x: x[key])
data = {item[key]: item for item in data}
return data
def trim_hanging_lines(s: str, n: int) -> str:
s = s.strip()
for _ in range(n):
s = s.split('\n', 1)[1].strip()
return s
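# Worked example (a sketch, using made-up inputs): trim_hanging_lines('1, 2\nAssistant 1 was helpful.', 1)
# drops the leading score line and returns 'Assistant 1 was helpful.', and
# read_jsonl('table/question.jsonl', key='question_id') returns a dict keyed by
# question_id instead of a plain list.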
if __name__ == '__main__':
questions = read_jsonl('table/question.jsonl', key='question_id')
# alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
# bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
# gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
# llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')
review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
# review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
# review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
# review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
# review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')
records = []
for qid in questions.keys():
r = {
'id': qid,
'category': questions[qid]['category'],
'question': questions[qid]['text'],
'answers': {
# 'alpaca': alpaca_answers[qid]['text'],
# 'llama': llama_answers[qid]['text'],
# 'bard': bard_answers[qid]['text'],
# 'gpt35': gpt35_answers[qid]['text'],
'vicuna': vicuna_answers[qid]['text'],
'ours': ours_answers[qid]['text'],
},
'evaluations': {
# 'alpaca': review_alpaca[qid]['text'],
# 'llama': review_llama[qid]['text'],
# 'bard': review_bard[qid]['text'],
'vicuna': review_vicuna[qid]['content'],
# 'gpt35': review_gpt35[qid]['text'],
},
'scores': {
'vicuna': review_vicuna[qid]['tuple'],
# 'alpaca': review_alpaca[qid]['score'],
# 'llama': review_llama[qid]['score'],
# 'bard': review_bard[qid]['score'],
# 'gpt35': review_gpt35[qid]['score'],
},
}
# cleanup data
cleaned_evals = {}
for k, v in r['evaluations'].items():
v = v.strip()
lines = v.split('\n')
# trim the first line if it's a pair of numbers
if re.match(r'\d+[, ]+\d+', lines[0]):
lines = lines[1:]
v = '\n'.join(lines)
cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')
r['evaluations'] = cleaned_evals
records.append(r)
# Reorder the records; this step is optional
for r in records:
if r['id'] <= 20:
r['id'] += 60
else:
r['id'] -= 20
for r in records:
if r['id'] <= 50:
r['id'] += 10
elif 50 < r['id'] <= 60:
r['id'] -= 50
for r in records:
if r['id'] == 7:
r['id'] = 1
elif r['id'] < 7:
r['id'] += 1
records.sort(key=lambda x: x['id'])
# Write to file
with open('webpage/data.json', 'w') as f:
json.dump({'questions': records, 'models': models}, f, indent=2)
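# For reference, a sketch of the output shape written to webpage/data.json:
# {"questions": [{"id": ..., "category": ..., "question": ...,
#                 "answers": {"vicuna": ..., "ours": ...},
#                 "evaluations": {"vicuna": ...},
#                 "scores": {"vicuna": [...]}},
#                ...],
#  "models": ["vicuna"]}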
# Copyright (c) Facebook, Inc. and its affiliates.
import re
from tqdm import tqdm
class EvalAIAnswerProcessor:
"""
Processes an answer similar to Eval AI
copied from
https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
"""
CONTRACTIONS = {
"aint": "ain't",
"arent": "aren't",
"cant": "can't",
"couldve": "could've",
"couldnt": "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
"didnt": "didn't",
"doesnt": "doesn't",
"dont": "don't",
"hadnt": "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
"hasnt": "hasn't",
"havent": "haven't",
"hed": "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
"hes": "he's",
"howd": "how'd",
"howll": "how'll",
"hows": "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
"Im": "I'm",
"Ive": "I've",
"isnt": "isn't",
"itd": "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
"itll": "it'll",
"let's": "let's",
"maam": "ma'am",
"mightnt": "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
"mightve": "might've",
"mustnt": "mustn't",
"mustve": "must've",
"neednt": "needn't",
"notve": "not've",
"oclock": "o'clock",
"oughtnt": "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
"shant": "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
"shouldve": "should've",
"shouldnt": "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": "somebodyd",
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
"somebodyll": "somebody'll",
"somebodys": "somebody's",
"someoned": "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
"someonell": "someone'll",
"someones": "someone's",
"somethingd": "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
"somethingll": "something'll",
"thats": "that's",
"thered": "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
"therere": "there're",
"theres": "there's",
"theyd": "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
"theyll": "they'll",
"theyre": "they're",
"theyve": "they've",
"twas": "'twas",
"wasnt": "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
"weve": "we've",
"werent": "weren't",
"whatll": "what'll",
"whatre": "what're",
"whats": "what's",
"whatve": "what've",
"whens": "when's",
"whered": "where'd",
"wheres": "where's",
"whereve": "where've",
"whod": "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
"wholl": "who'll",
"whos": "who's",
"whove": "who've",
"whyll": "why'll",
"whyre": "why're",
"whys": "why's",
"wont": "won't",
"wouldve": "would've",
"wouldnt": "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
"yall": "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
"youd": "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
"youll": "you'll",
"youre": "you're",
"youve": "you've",
}
NUMBER_MAP = {
"none": "0",
"zero": "0",
"one": "1",
"two": "2",
"three": "3",
"four": "4",
"five": "5",
"six": "6",
"seven": "7",
"eight": "8",
"nine": "9",
"ten": "10",
}
ARTICLES = ["a", "an", "the"]
PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
PUNCTUATIONS = [
";",
r"/",
"[",
"]",
'"',
"{",
"}",
"(",
")",
"=",
"+",
"\\",
"_",
"-",
">",
"<",
"@",
"`",
",",
"?",
"!",
]
def __init__(self, *args, **kwargs):
pass
def word_tokenize(self, word):
word = word.lower()
word = word.replace(",", "").replace("?", "").replace("'s", " 's")
return word.strip()
def process_punctuation(self, in_text):
out_text = in_text
for p in self.PUNCTUATIONS:
if (p + " " in in_text or " " + p in in_text) or (
re.search(self.COMMA_STRIP, in_text) is not None
):
out_text = out_text.replace(p, "")
else:
out_text = out_text.replace(p, " ")
out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
return out_text
def process_digit_article(self, in_text):
out_text = []
temp_text = in_text.lower().split()
for word in temp_text:
word = self.NUMBER_MAP.setdefault(word, word)
if word not in self.ARTICLES:
out_text.append(word)
else:
pass
for word_id, word in enumerate(out_text):
if word in self.CONTRACTIONS:
out_text[word_id] = self.CONTRACTIONS[word]
out_text = " ".join(out_text)
return out_text
def __call__(self, item):
item = self.word_tokenize(item)
item = item.replace("\n", " ").replace("\t", " ").strip()
item = self.process_punctuation(item)
item = self.process_digit_article(item)
return item
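# Usage example (a sketch): EvalAIAnswerProcessor()("Two Dogs!") returns "2 dogs",
# i.e. the answer is lower-cased, punctuation is removed and number words are mapped
# to digits; articles ("a", "an", "the") would also be dropped.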
class TextVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def _compute_answer_scores(self, raw_answers):
"""
compute the accuracy (soft score) of human answers
"""
answers = [self.answer_processor(a) for a in raw_answers]
assert len(answers) == 10
gt_answers = list(enumerate(answers))
unique_answers = set(answers)
unique_answer_scores = {}
for unique_answer in unique_answers:
accs = []
for gt_answer in gt_answers:
other_answers = [item for item in gt_answers if item != gt_answer]
matching_answers = [
item for item in other_answers if item[1] == unique_answer
]
acc = min(1, float(len(matching_answers)) / 3)
accs.append(acc)
unique_answer_scores[unique_answer] = sum(accs) / len(accs)
return unique_answer_scores
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in tqdm(pred_list):
pred_answer = self.answer_processor(entry["pred_answer"])
unique_answer_scores = self._compute_answer_scores(entry["gt_answers"])
score = unique_answer_scores.get(pred_answer, 0.0)
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
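# Worked example (a sketch): if "cat" appears in 2 of the 10 human answers, the
# leave-one-out loop sees 1 matching answer (acc = 1/3) when the held-out answer
# is "cat" and 2 matches (acc = 2/3) otherwise, so the soft score for "cat" is
# (2 * 1/3 + 8 * 2/3) / 10 = 0.6.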
class STVQAAccuracyEvaluator:
def __init__(self):
self.answer_processor = EvalAIAnswerProcessor()
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
pred_answer = self.answer_processor(entry["pred_answer"])
gts = [self.answer_processor(a) for a in entry["gt_answers"]]
score = 1.0 if pred_answer in gts else 0.0
pred_scores.append(score)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
class STVQAANLSEvaluator:
def __init__(self):
import editdistance # install with `pip install editdistance`
self.get_edit_distance = editdistance.eval
def get_anls(self, s1, s2):
s1 = s1.lower().strip()
s2 = s2.lower().strip()
iou = 1 - self.get_edit_distance(s1, s2) / max(len(s1), len(s2))
anls = iou if iou >= 0.5 else 0.0
return anls
def eval_pred_list(self, pred_list):
pred_scores = []
for entry in pred_list:
anls = max(
self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
)
pred_scores.append(anls)
accuracy = sum(pred_scores) / len(pred_scores)
return accuracy
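# Worked example (a sketch): get_anls("hello", "hallo") has edit distance 1 over a
# maximum length of 5, giving a similarity of 0.8; since 0.8 >= 0.5 it is kept,
# whereas any pair with similarity below 0.5 contributes 0.0.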
class TextCapsBleu4Evaluator:
def __init__(self):
# The following script requires Java 1.8.0 and pycocotools installed.
# The pycocoevalcap can be installed with pip as
# pip install git+https://github.com/ronghanghu/coco-caption.git@python23
# Original pycocoevalcap code is at https://github.com/tylin/coco-caption
# but has no python3 support yet.
try:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
except ModuleNotFoundError:
print(
"Please install pycocoevalcap module using "
"pip install git+https://github.com/ronghanghu/coco-caption.git@python23" # noqa
)
raise
self.tokenizer = PTBTokenizer()
self.scorer = Bleu(4)
def eval_pred_list(self, pred_list):
# Create reference and hypotheses captions.
gts = {}
res = {}
for idx, entry in enumerate(pred_list):
gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
res[idx] = [{"caption": entry["pred_answer"]}]
gts = self.tokenizer.tokenize(gts)
res = self.tokenizer.tokenize(res)
score, _ = self.scorer.compute_score(gts, res)
bleu4 = score[3] # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
return bleu4
import argparse
import torch
import os
import json
from tqdm import tqdm
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
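# Example (a sketch): split_list(list(range(10)), 3) uses chunk_size = ceil(10 / 3) = 4
# and returns [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]], so get_chunk(list(range(10)), 3, 1)
# yields [4, 5, 6, 7].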
def create_one_query(problem, shot_num, shot_type, use_caption):
### [1] Demo prompt
demo_prompt = ""
### [2] Test query
# problem info
question = problem['question']
unit = problem['unit']
choices = problem['choices']
# caption = problem['caption']
precision = problem['precision']
question_type = problem['question_type']
answer_type = problem['answer_type']
# hint
if shot_type == 'solution':
if question_type == "multi_choice":
assert answer_type == "text"
hint_text = f"Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end."
else:
assert answer_type in ["integer", "float", "list"]
if answer_type == "integer":
hint_text = f"Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end."
elif answer_type == "float" and precision == 1:
hint_text = f"Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end."
elif answer_type == "float" and precision == 2:
hint_text = f"Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end."
elif answer_type == "list":
hint_text = f"Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end."
else:
assert shot_type == 'code'
hint_text = "Hint: Please generate a python code to solve the problem"
# question
question_text = f"Question: {question}"
if unit:
question_text += f" (Unit: {unit})"
# choices
if choices:
# choices: (A) 1.2 (B) 1.3 (C) 1.4 (D) 1.5
texts = ["Choices:"]
for i, choice in enumerate(choices):
texts.append(f"({chr(ord('A')+i)}) {choice}")
choices_text = "\n".join(texts)
else:
choices_text = ""
# prompt
if shot_type == 'solution':
prompt = "Solution: "
else:
assert shot_type == 'code'
prompt = "Python code: "
elements = [hint_text, question_text, choices_text]
test_query = "\n".join([e for e in elements if e != ""])
### [3] Final query
query = demo_prompt + "\n\n" + test_query
query = query.strip()
return query
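# Example output of create_one_query (a sketch, for a hypothetical multi_choice
# problem with unit "cm" and choices ["1.2", "1.3"] under shot_type='solution'):
#   Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
#   Question: <question text> (Unit: cm)
#   Choices:
#   (A) 1.2
#   (B) 1.3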
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name,
load_8bit=args.load_8bit)
questions = json.load(open(os.path.expanduser(args.question_file), "r"))
questions = [dict(pid=pid, info=qs) for pid, qs in questions.items()]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
if os.path.exists(answers_file):
file = open(answers_file, "r")
pred_contents = [json.loads(line) for line in file]
done_pid = [sample['pid'] for sample in pred_contents]
else:
done_pid = []
ans_file = open(answers_file, "a")
for i, line in enumerate(tqdm(questions)):
idx = line['pid']
info = line['info']
if idx in done_pid:
continue
qs = create_one_query(
problem = info,
shot_num = 0,
shot_type = 'solution',
use_caption = False,
)
query = qs
if 'image' in info:
image_file = info["image"]
image = Image.open(os.path.join(args.image_folder, image_file))
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
if getattr(model.config, 'mm_use_im_start_end', False):
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
else:
images = None
images_aux = None
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
info['query'] = query
info['response'] = outputs
ans_file.write(json.dumps(info) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v0")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--answer-prompter", action="store_true")
parser.add_argument('--load_8bit', type=bool, default=False)
parser.add_argument("--single-pred-prompt", action="store_true")
args = parser.parse_args()
eval_model(args)
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.conversation import default_conversation
from mgm.utils import disable_torch_init
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
# Model
disable_torch_init()
model_name = os.path.expanduser(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name,
torch_dtype=torch.float16).cuda()
ques_file = open(os.path.expanduser(questions_file), "r")
ans_file = open(os.path.expanduser(answers_file), "w")
for i, line in enumerate(tqdm(ques_file)):
idx = json.loads(line)["question_id"]
qs = json.loads(line)["text"]
cat = json.loads(line)["category"]
conv = default_conversation.copy()
conv.append_message(conv.roles[0], qs)
prompt = conv.get_prompt()
inputs = tokenizer([prompt])
input_ids = torch.as_tensor(inputs.input_ids).cuda()
output_ids = model.generate(
input_ids,
do_sample=True,
use_cache=True,
temperature=0.7,
max_new_tokens=1024,)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
try:
index = outputs.index(conv.sep, len(prompt))
except ValueError:
outputs += conv.sep
index = outputs.index(conv.sep, len(prompt))
outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
args = parser.parse_args()
eval_model(args.model_name, args.question_file, args.answers_file)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
for line in tqdm(questions):
idx = line["question_id"]
image_file = line["image"]
qs = line["text"]
cur_prompt = qs
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
# Custom dataset class
class CustomDataset(Dataset):
def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
self.questions = questions
self.image_folder = image_folder
self.tokenizer = tokenizer
self.image_processor = image_processor
self.model_config = model_config
def __getitem__(self, index):
line = self.questions[index]
image_file = line["image"]
qs = line["text"]
if self.model_config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
if hasattr(self.model_config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model_config.image_size_aux
self.image_processor.crop_size['width'] = self.model_config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model_config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model_config)[0]
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
image_grid = getattr(self.model_config, 'image_grid', 1)
if hasattr(self.model_config, 'image_size_aux'):
raw_shape = [self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
self.image_processor.image_size_raw['height'],
image_grid,
self.image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width'])
if getattr(self.model_config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
return input_ids, image_tensor, image_tensor_aux
def __len__(self):
return len(self.questions)
# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
assert batch_size == 1, "batch_size must be 1"
dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
return data_loader
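# Note (a sketch of the intended usage): with batch_size fixed to 1 and shuffle=False,
# the loader yields one (input_ids, image_tensor, image_tensor_aux) triple per question,
# in the same order as `questions`, which is why eval_model below zips the loader with
# the question list.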
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, load_8bit=args.load_8bit)
questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
if 'plain' in args.conv_mode and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
args.conv_mode = args.conv_mode + '_mmtag'
print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)
for (input_ids, image_tensor, image_tensor_aux), line in tqdm(zip(data_loader, questions), total=len(questions)):
idx = line["question_id"]
cur_prompt = line["text"]
input_ids = input_ids.to(device=model.device, non_blocking=True)
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor.to(dtype=model.dtype, device=model.device, non_blocking=True),
images_aux=image_tensor_aux.to(dtype=model.dtype, device=model.device, non_blocking=True) if len(image_tensor_aux)>0 else None,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=args.max_new_tokens,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
# ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument('--load_8bit', type=bool, default=False)
parser.add_argument("--max_new_tokens", type=int, default=128)
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
import pandas as pd
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
from PIL import Image
import math
all_options = ['A', 'B', 'C', 'D']
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def is_none(value):
if value is None:
return True
if type(value) is float and math.isnan(value):
return True
if type(value) is str and value.lower() == 'nan':
return True
if type(value) is str and value.lower() == 'none':
return True
return False
def get_options(row, options):
parsed_options = []
for option in options:
option_value = row[option]
if is_none(option_value):
break
parsed_options.append(option_value)
return parsed_options
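# Example (a sketch): for a row with A="cat", B="dog" and C/D empty (NaN),
# get_options(row, ['A', 'B', 'C', 'D']) returns ["cat", "dog"]; parsing stops
# at the first missing option.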
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = pd.read_table(os.path.expanduser(args.question_file))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
args.conv_mode = args.conv_mode + '_mmtag'
print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
for index, row in tqdm(questions.iterrows(), total=len(questions)):
options = get_options(row, all_options)
cur_option_char = all_options[:len(options)]
if args.all_rounds:
num_rounds = len(options)
else:
num_rounds = 1
for round_idx in range(num_rounds):
idx = row['index']
question = row['question']
hint = row['hint']
image = load_image_from_base64(row['image'])
if not is_none(hint):
question = hint + '\n' + question
for option_char, option in zip(all_options[:len(options)], options):
question = question + '\n' + option_char + '. ' + option
qs = cur_prompt = question
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
if args.single_pred_prompt:
if args.lang == 'cn':
qs = qs + '\n' + "请直接回答选项字母。"
else:
qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
terminators = tokenizer.eos_token_id
if "llama_3" in args.conv_mode:
terminators = [terminators, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
# no_repeat_ngram_size=3,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=terminators, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"round_id": round_idx,
"prompt": cur_prompt,
"text": outputs,
"options": options,
"option_char": cur_option_char,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
# rotate options
options = options[1:] + options[:1]
cur_option_char = cur_option_char[1:] + cur_option_char[:1]
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--all-rounds", action="store_true")
parser.add_argument("--single-pred-prompt", action="store_true")
parser.add_argument("--lang", type=str, default="en")
args = parser.parse_args()
eval_model(args)
import argparse
import torch
from tqdm import tqdm
import json
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
import requests
from PIL import Image
from io import BytesIO
def load_image(image_file):
if image_file.startswith('http') or image_file.startswith('https'):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert('RGB')
else:
image = Image.open(image_file).convert('RGB')
return image
def eval_model(args):
# Model
disable_torch_init()
model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True)
with open(args.questions_file) as f:
llvqa_data = json.load(f)
for i, llddata in enumerate(tqdm(llvqa_data)):
filename = llddata["img_path"]
if args.lang == "en":
message = llddata["question"] + "\nChoose between one of the options as follows:\n"
elif args.lang == "zh":
message = llddata["question"] + "\n在下列选项中选择一个:\n"
else:
raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. Contact us (https://github.com/VQAssessment/Q-Bench/) to convert Q-Bench into more languages.")
for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]):
message += f"{choice} {ans}\n"
qs = message
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
if 'llama-2' in model_name.lower():
conv_mode = "llava_llama_2"
elif "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt"
else:
conv_mode = "llava_v0"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
else:
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image = load_image(args.image_folder + filename)
image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor,
num_beams=1,
do_sample=False,
temperature=0,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
llddata["response"] = outputs
with open(args.answers_file, "a") as wf:
json.dump(llddata, wf)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="llava-v1.5")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="./playground/data/qbench/images_llvisionqa")
parser.add_argument("--questions-file", type=str, default="./playground/data/qbench/llvisionqa_dev.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v1")
parser.add_argument("--lang", type=str, default="en")
args = parser.parse_args()
eval_model(args)
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from PIL import Image
import math
def split_list(lst, n):
"""Split a list into n (roughly) equal-sized chunks"""
chunk_size = math.ceil(len(lst) / n) # ceiling division, so no items are dropped
return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
chunks = split_list(lst, n)
return chunks[k]
def eval_model(args):
# Model
disable_torch_init()
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
questions = json.load(open(os.path.expanduser(args.question_file), "r"))
questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
answers_file = os.path.expanduser(args.answers_file)
os.makedirs(os.path.dirname(answers_file), exist_ok=True)
ans_file = open(answers_file, "w")
for i, line in enumerate(tqdm(questions)):
idx = line["id"]
question = line['conversations'][0]
qs = question['value'].replace('<image>', '').strip()
cur_prompt = qs
if 'image' in line:
image_file = line["image"]
image = Image.open(os.path.join(args.image_folder, image_file))
if hasattr(model.config, 'image_size_aux'):
if not hasattr(image_processor, 'image_size_raw'):
image_processor.image_size_raw = image_processor.crop_size.copy()
image_processor.crop_size['height'] = model.config.image_size_aux
image_processor.crop_size['width'] = model.config.image_size_aux
image_processor.size['shortest_edge'] = model.config.image_size_aux
image_tensor = process_images([image], image_processor, model.config)[0]
image_grid = getattr(model.config, 'image_grid', 1)
if hasattr(model.config, 'image_size_aux'):
raw_shape = [image_processor.image_size_raw['height'] * image_grid,
image_processor.image_size_raw['width'] * image_grid]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(3,
image_grid,
image_processor.image_size_raw['height'],
image_grid,
image_processor.image_size_raw['width'])
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(-1, 3,
image_processor.image_size_raw['height'],
image_processor.image_size_raw['width'])
if getattr(model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(global_image,
size=[image_processor.image_size_raw['height'],
image_processor.image_size_raw['width']],
mode='bilinear',
align_corners=False)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
if getattr(model.config, 'mm_use_im_start_end', False):
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
cur_prompt = '<image>' + '\n' + cur_prompt
else:
images = None
images_aux = None
if args.single_pred_prompt:
qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
if hasattr(model, "update_prompt"):
model.update_prompt([[cur_prompt]])
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images,
images_aux=images_aux,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
max_new_tokens=1024,
bos_token_id=tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=tokenizer.eos_token_id, # End of sequence token
pad_token_id=tokenizer.pad_token_id, # Pad token
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
ans_id = shortuuid.uuid()
ans_file.write(json.dumps({"question_id": idx,
"prompt": cur_prompt,
"text": outputs,
"answer_id": ans_id,
"model_id": model_name,
"metadata": {}}) + "\n")
ans_file.flush()
ans_file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
parser.add_argument("--conv-mode", type=str, default="llava_v0")
parser.add_argument("--num-chunks", type=int, default=1)
parser.add_argument("--chunk-idx", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--answer-prompter", action="store_true")
parser.add_argument("--single-pred-prompt", action="store_true")
args = parser.parse_args()
eval_model(args)
"""Generate answers with GPT-3.5"""
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
import argparse
import json
import os
import time
import concurrent.futures
import openai
import tqdm
import shortuuid
MODEL = 'gpt-3.5-turbo'
MODEL_ID = 'gpt-3.5-turbo:20230327'
def get_answer(question_id: int, question: str, max_tokens: int):
ans = {
'answer_id': shortuuid.uuid(),
'question_id': question_id,
'model_id': MODEL_ID,
}
for _ in range(3):
try:
response = openai.ChatCompletion.create(
model=MODEL,
messages=[{
'role': 'system',
'content': 'You are a helpful assistant.'
}, {
'role': 'user',
'content': question,
}],
max_tokens=max_tokens,
)
ans['text'] = response['choices'][0]['message']['content']
return ans
except Exception as e:
print('[ERROR]', e)
ans['text'] = '#ERROR#'
time.sleep(1)
return ans
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
parser.add_argument('-q', '--question')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
questions_dict = {}
with open(os.path.expanduser(args.question)) as f:
for line in f:
if not line:
continue
q = json.loads(line)
questions_dict[q['question_id']] = q['text']
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
futures = []
for qid, question in questions_dict.items():
future = executor.submit(get_answer, qid, question, args.max_tokens)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
answers.append(future.result())
answers.sort(key=lambda x: x['question_id'])
with open(os.path.expanduser(args.output), 'w') as f:
table = [json.dumps(ans) for ans in answers]
f.write('\n'.join(table))
import argparse
import torch
from mgm.constants import (
IMAGE_TOKEN_INDEX,
DEFAULT_IMAGE_TOKEN,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
IMAGE_PLACEHOLDER,
)
from mgm.conversation import conv_templates, SeparatorStyle
from mgm.model.builder import load_pretrained_model
from mgm.utils import disable_torch_init
from mgm.mm_utils import (
process_images,
tokenizer_image_token,
get_model_name_from_path,
)
import requests
from PIL import Image
from io import BytesIO
import re
def image_parser(args):
out = args.image_file.split(args.sep)
return out
def load_image(image_file):
if image_file.startswith("http") or image_file.startswith("https"):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert("RGB")
else:
image = Image.open(image_file).convert("RGB")
return image
def load_images(image_files):
out = []
for image_file in image_files:
image = load_image(image_file)
out.append(image)
return out
def eval_model(args):
# Model
disable_torch_init()
model_name = get_model_name_from_path(args.model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
args.model_path, args.model_base, model_name
)
qs = args.query
image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
if IMAGE_PLACEHOLDER in qs:
if model.config.mm_use_im_start_end:
qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
else:
qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
else:
if model.config.mm_use_im_start_end:
qs = image_token_se + "\n" + qs
else:
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
if "llama-2" in model_name.lower():
conv_mode = "llava_llama_2"
elif "mistral" in model_name.lower():
conv_mode = "mistral_instruct"
elif "v1.6-34b" in model_name.lower():
conv_mode = "chatml_direct"
elif "v1" in model_name.lower():
conv_mode = "llava_v1"
elif "mpt" in model_name.lower():
conv_mode = "mpt"
else:
conv_mode = "llava_v0"
if args.conv_mode is not None and conv_mode != args.conv_mode:
print(
"[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
conv_mode, args.conv_mode, args.conv_mode
)
)
else:
args.conv_mode = conv_mode
conv = conv_templates[args.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
image_files = image_parser(args)
images = load_images(image_files)
images_tensor = process_images(
images,
image_processor,
model.config
).to(model.device, dtype=torch.float16)
input_ids = (
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=images_tensor,
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
top_p=args.top_p,
num_beams=args.num_beams,
max_new_tokens=args.max_new_tokens,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
parser.add_argument("--model-base", type=str, default=None)
parser.add_argument("--image-file", type=str, required=True)
parser.add_argument("--query", type=str, required=True)
parser.add_argument("--conv-mode", type=str, default=None)
parser.add_argument("--sep", type=str, default=",")
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=None)
parser.add_argument("--num_beams", type=int, default=1)
parser.add_argument("--max_new_tokens", type=int, default=512)
args = parser.parse_args()
eval_model(args)
import json
import os
from collections import defaultdict
import numpy as np
import argparse
def parse_args():
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-d', '--dir', default=None)
parser.add_argument('-v', '--version', default=None)
parser.add_argument('-s', '--select', nargs='*', default=None)
parser.add_argument('-f', '--files', nargs='*', default=[])
parser.add_argument('-i', '--ignore', nargs='*', default=[])
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
if args.ignore is not None:
args.ignore = [int(x) for x in args.ignore]
if len(args.files) > 0:
review_files = args.files
else:
review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_') or 'review' in args.dir)]
for review_file in sorted(review_files):
config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
if args.select is not None and any(x not in config for x in args.select):
continue
if '0613' in config:
version = '0613'
else:
version = '0314'
if args.version is not None and args.version != version:
continue
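        # Accumulate review score tuples (or single scores) per category; 'all' aggregates every question.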
scores = defaultdict(list)
print(config)
with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
for review_str in f:
review = json.loads(review_str)
if review['question_id'] in args.ignore:
continue
if 'category' in review:
scores[review['category']].append(review['tuple'])
scores['all'].append(review['tuple'])
else:
if 'tuple' in review:
scores['all'].append(review['tuple'])
else:
scores['all'].append(review['score'])
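        # Print per-category results: relative score (second/first, as a percentage) and both mean scores scaled by 10.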
for k, v in sorted(scores.items()):
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
# print(k, stats, round(stats[1]/stats[0]*100, 1))
print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
print('=================================')
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 2406 2406"><path d="M1 578.4C1 259.5 259.5 1 578.4 1h1249.1c319 0 577.5 258.5 577.5 577.4V2406H578.4C259.5 2406 1 2147.5 1 1828.6V578.4z" fill="#74aa9c"/><path d="M1107.3 299.1c-198 0-373.9 127.3-435.2 315.3C544.8 640.6 434.9 720.2 370.5 833c-99.3 171.4-76.6 386.9 56.4 533.8-41.1 123.1-27 257.7 38.6 369.2 98.7 172 297.3 260.2 491.6 219.2 86.1 97 209.8 152.3 339.6 151.8 198 0 373.9-127.3 435.3-315.3 127.5-26.3 237.2-105.9 301-218.5 99.9-171.4 77.2-386.9-55.8-533.9v-.6c41.1-123.1 27-257.8-38.6-369.8-98.7-171.4-297.3-259.6-491-218.6-86.6-96.8-210.5-151.8-340.3-151.2zm0 117.5-.6.6c79.7 0 156.3 27.5 217.6 78.4-2.5 1.2-7.4 4.3-11 6.1L952.8 709.3c-18.4 10.4-29.4 30-29.4 51.4V1248l-155.1-89.4V755.8c-.1-187.1 151.6-338.9 339-339.2zm434.2 141.9c121.6-.2 234 64.5 294.7 169.8 39.2 68.6 53.9 148.8 40.4 226.5-2.5-1.8-7.3-4.3-10.4-6.1l-360.4-208.2c-18.4-10.4-41-10.4-59.4 0L1024 984.2V805.4L1372.7 604c51.3-29.7 109.5-45.4 168.8-45.5zM650 743.5v427.9c0 21.4 11 40.4 29.4 51.4l421.7 243-155.7 90L597.2 1355c-162-93.8-217.4-300.9-123.8-462.8C513.1 823.6 575.5 771 650 743.5zm807.9 106 348.8 200.8c162.5 93.7 217.6 300.6 123.8 462.8l.6.6c-39.8 68.6-102.4 121.2-176.5 148.2v-428c0-21.4-11-41-29.4-51.4l-422.3-243.7 155-89.3zM1201.7 997l177.8 102.8v205.1l-177.8 102.8-177.8-102.8v-205.1L1201.7 997zm279.5 161.6 155.1 89.4v402.2c0 187.3-152 339.2-339 339.2v-.6c-79.1 0-156.3-27.6-217-78.4 2.5-1.2 8-4.3 11-6.1l360.4-207.5c18.4-10.4 30-30 29.4-51.4l.1-486.8zM1380 1421.9v178.8l-348.8 200.8c-162.5 93.1-369.6 38-463.4-123.7h.6c-39.8-68-54-148.8-40.5-226.5 2.5 1.8 7.4 4.3 10.4 6.1l360.4 208.2c18.4 10.4 41 10.4 59.4 0l421.9-243.7z" fill="white"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" height="48" viewBox="0 96 960 960" width="48"><path d="m762.846 947.614-124.77-124.769-88 88-30.306-30.692q-16.616-16.231-16.616-40.077 0-23.846 16.616-40.461L708 611.385q16.23-16.231 40.076-16.231t40.462 16.231l30.307 30.691-88 88 124.154 124.77q8.615 8.615 8.615 20.23 0 11.616-8.615 20.231l-51.692 52.307q-8.615 9-20.231 9-11.615 0-20.23-9Zm97.153-624.076L412.768 771.153l27.847 28.077q16.231 16.616 16.231 40.462 0 23.846-16.231 40.077l-30.691 30.691-88-88-124.77 124.769q-8.615 9-20.23 9-11.616 0-20.231-9l-52.307-52.307q-9-8.615-9-20.23 0-11.616 9-20.231l124.769-124.769-88-88L171.847 611q16.231-16.23 40.077-16.23 23.846 0 40.461 16.23l28.462 28.232 447.615-447.231h131.537v131.537ZM323.846 483.769l33.769-34.154 34.154-34.153-34.154 34.153-33.769 34.154Zm-31.999 31.999-191.846-192.23V192.001h131.537l191.461 191.846-31.23 31.615-179.077-178.077h-67.307v67.307l178.461 179.077-31.999 31.999Zm87.691 222.77 435.077-433.846v-67.307h-67.307L312.231 670.846l67.307 67.692Zm0 0L346.385 704l-34.154-33.154L346.385 704l33.153 34.538Z"/></svg>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</title>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
<link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
<link rel="stylesheet" href="styles.css">
</head>
<body>
<nav class="navbar navbar-expand-lg navbar-dark bg-dark">
<a class="navbar-brand" href="#">🏔️ Vicuna Evaluation Examples</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarNav">
<ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="nav-link" href="https://chat.lmsys.org/">Demo</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://vicuna.lmsys.org">Blog</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/lm-sys/FastChat">Github</a>
</li>
</ul>
</div>
</nav>
<div class="container mt-5">
<h2 class="text-center mb-5">Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</h2>
<!-- Selection -->
<div class="form-row">
<div class="form-group col-md-2">
<label for="category-select">Category</label>
<select class="form-control" id="category-select"></select>
</div>
<div class="form-group col-md-8">
<label for="question-select">Question</label>
<select class="form-control" id="question-select"></select>
</div>
<div class="form-group col-md-2">
<div class="col-md-2"><label>&nbsp;</label></div>
<div class="btn-group" role="group" aria-label="Left and Right Controller">
<button type="button" class="form-control btn btn-primary" id="prev-question"><i class="material-icons">keyboard_arrow_left</i></button>
<button type="button" class="form-control btn btn-primary" id="next-question"><i class="material-icons">keyboard_arrow_right</i></button>
</div>
</div>
</div>
<!-- "Battle" -->
<div class="row mb-4" style="justify-content: center;">
<div class="col" style="display: flex; justify-content: center; align-items: center;">
<label class="adjustable-font-size" id="other-score-label">*/10</label>
</div>
<div class="col">
<div class="vertical-flex-layout">
<img class="shadow figure-img img-fluid" src="" alt="other logo" width="150" id="other-model-figure">
</div>
</div>
<div class="col">
<div class="vertical-flex-layout">
<!-- from: https://fonts.google.com/icons?icon.query=battle&selected=Material+Symbols+Outlined:swords:FILL@0;wght@300;GRAD@0;opsz@48&icon.style=Outlined -->
<img class="figure-img img-fluid" src="figures/swords_FILL0_wght300_GRAD0_opsz48.svg" width="60" height="60">
</div>
</div>
<div class="col">
<div class="vertical-flex-layout">
<img class="shadow figure-img img-fluid" src="figures/vicuna.jpeg" alt="vicuna logo" width="150" id="our-model-figure">
</div>
</div>
<div class="col" style="display: flex; justify-content: center; align-items: center;">
<label class="adjustable-font-size" id="our-score-label">*/10</label>
</div>
</div>
<!-- Question Card -->
<div class="card mb-4">
<div class="card-body" id="selected-question"></div>
</div>
<!-- Answer Cards -->
<div class="row">
<div class="col-md-6">
<div class="card mb-4 expandable-card">
<div class="card-header" style="padding-bottom: 0.2rem" id="other-model-header-bg">
<div class="row">
<div class="col-md-5" style="align-items: center; display: flex;">
<label id="other-model-header">Assistant #1</label>
</div>
<div class="col-md-7">
<select class="form-control" id="model-select" style="height: fit-content; margin-top: -0.3rem;"></select>
</div>
</div>
</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="other-model-answer"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card mb-4 expandable-card">
<div class="card-header" id="our-model-header">
Assistant #2 (Vicuna, our model)
</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="our-model-answer"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
</div>
<!-- Evaluation -->
<div class="card expandable-card">
<div class="card-header" style="background-color: #c9c9f2;" id="evaluation-header">GPT-4 Evaluation</div>
<div class="card-body">
<div class="card-text-container">
<div class="card-text" id="evaluation-result"></div>
</div>
<div class="btn btn-primary expand-btn" style="display:flex;"></div>
</div>
</div>
</div>
<div class="container-fluid bg-light py-2">
<div class="text-center">
<small class="text-muted">This website is co-authored with <a href="https://openai.com" target="_blank">GPT-4</a>.</small>
</div>
</div>
<!-- Marked.js -->
<script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/lib/marked.umd.min.js"></script>
<!-- Bootstrap and Popper.js JavaScript dependencies -->
<script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.6/dist/umd/popper.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
<script src="script.js"></script>
<script>
// Fetch the JSON file
fetch('data.json')
.then(response => response.json())
.then(json_data => {
// Populate the models and questions.
populateModels(json_data.models);
populateQuestions(json_data.questions);
displayQuestion(currentQuestionIndex);
}).catch(error => console.error(error));
</script>
</body>
</html>
// Description: Script for the evaluation webpage.
let currentQuestionIndex = 1;
// Store the model name mapping for later use.
const modelNameMapping = {
"gpt35": "ChatGPT-3.5",
"gpt4": "GPT-4",
"alpaca": "Alpaca-13b",
"vicuna": "Vicuna-13b",
"llama": "LLaMA-13b",
"bard": "Bard",
};
const modelFigureMapping = {
"vicuna": "figures/vicuna.jpeg",
// Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg
"gpt35": "figures/chatgpt.svg",
// Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/
"bard": "figures/bard.jpg",
// Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html
"alpaca": "figures/alpaca.png",
// Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg
"llama": "figures/llama.jpg",
}
// Store the question data in a mapping for later use.
const questionMapping = {};
// Store the question ids in a mapping for later use.
const categoryMapping = {};
// Store the number of questions for later use.
let questionsCount = 0;
function text2Markdown(text) {
// Normalize the text for markdown rendering.
text = text.trim().replaceAll('\n\n', '\n').replaceAll('\n', '\n\n');
return marked.parse(text);
}
function capitalizeFirstChar(str) {
if (!str || str.length === 0) {
return str;
}
return str.charAt(0).toUpperCase() + str.slice(1);
}
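// Rebuild the question dropdown with every question in the selected question's category.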
function updateQuestionSelect(question_id) {
const select = document.getElementById('question-select');
// Clear the question select.
select.innerHTML = '';
// Populate the question select.
    const category = questionMapping[question_id].category;
categoryMapping[category].forEach(question_id => {
const question = questionMapping[question_id];
const option = document.createElement('option');
option.value = question_id;
option.textContent = 'Q' + question_id.toString() + ': ' + question.question;
select.appendChild(option);
});
select.value = question_id;
}
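// Swap the displayed logo to match the currently selected comparison model.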
function updateModelSelect() {
const select = document.getElementById('model-select');
    const img_path = modelFigureMapping[select.value];
document.getElementById('other-model-figure').src = img_path;
}
function populateModels(models) {
const select = document.getElementById('model-select');
models.forEach(model => {
const option = document.createElement('option');
option.value = model;
option.textContent = modelNameMapping[model];
select.appendChild(option);
});
updateModelSelect();
}
function populateQuestions(questions) {
const category_select = document.getElementById('category-select');
questionsCount = questions.length;
questions.forEach(question => {
const option = document.createElement('option');
// Store the question data in a mapping for later use.
questionMapping[question.id] = {
category: question.category,
question: question.question,
answers: question.answers,
evaluations: question.evaluations,
scores: question.scores,
};
// Store the question id in the category mapping.
if (question.category in categoryMapping) {
categoryMapping[question.category].push(question.id);
} else {
categoryMapping[question.category] = [question.id];
const category_option = document.createElement('option');
category_option.value = question.category;
category_option.textContent = capitalizeFirstChar(question.category);
category_select.appendChild(category_option);
}
});
// Set the default category.
updateQuestionSelect(currentQuestionIndex);
}
function displayQuestion(index) {
const question = questionMapping[index].question;
document.getElementById('selected-question').innerHTML = text2Markdown('**Question:** ' + question);
displayAnswers(index);
}
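// Render both answers and the GPT-4 review, then update headers, scores, and winner highlighting.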
function displayAnswers(index) {
const question = questionMapping[index];
const otherModel = document.getElementById('model-select').value;
// render the answers with markdown
document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]);
document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna);
// Display evaluation
    const score = question.scores[otherModel];
    const score_text = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10";
document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + score_text + ")";
document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]);
// Update model names
let assistant1_title = "Assistant #1"; // (" + modelNameMapping[otherModel] + ")";
let assistant2_title = "Assistant #2 (Vicuna-13b, our model)";
// Update scores/labels.
let assistant1_score_label = score[0].toString() + '/10';
let assistant2_score_label = score[1].toString() + '/10';
const colorRed ='#fa9'; // '#eb978d';
// const colorGreen = '#c9f2c9';
const colorBlue = '#8ef'; // '#71dbf9';
const colorYellow = '#fe7'; // '#fada57';
let otherModelHeaderColor = '';
let ourModelHeaderColor = '';
// Update the winner.
if (score[0] == score[1]) {
assistant1_title = '🏆 ' + assistant1_title;
assistant1_score_label = '🏆 ' + assistant1_score_label;
assistant2_title = '🏆 ' + assistant2_title;
assistant2_score_label = '🏆 ' + assistant2_score_label;
otherModelHeaderColor = colorYellow;
ourModelHeaderColor = colorYellow;
} else if (score[0] > score[1]) {
assistant1_title = '🏆 ' + assistant1_title;
assistant1_score_label = '🏆 ' + assistant1_score_label;
otherModelHeaderColor = colorBlue;
ourModelHeaderColor = colorRed;
} else if (score[0] < score[1]) {
assistant2_title = '🏆 ' + assistant2_title;
assistant2_score_label = '🏆 ' + assistant2_score_label;
otherModelHeaderColor = colorRed;
ourModelHeaderColor = colorBlue;
}
document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor;
document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor;
document.getElementById('other-model-header').textContent = assistant1_title;
document.getElementById('our-model-header').textContent = assistant2_title;
document.getElementById('other-score-label').textContent = assistant1_score_label;
document.getElementById('our-score-label').textContent = assistant2_score_label;
    // Reset the expanded state and update expand-button visibility for both cards after displaying answers.
document.querySelectorAll('.expandable-card').forEach(card => {
card.classList.remove('expanded');
updateExpandButtonVisibility(card);
const expandBtn = card.querySelector('.expand-btn');
expandBtn.innerHTML = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more'; // .textContent = 'Show more';
});
}
document.getElementById('question-select').addEventListener('change', e => {
currentQuestionIndex = parseInt(e.target.value);
displayQuestion(currentQuestionIndex);
});
document.getElementById('category-select').addEventListener('change', e => {
let currentCategory = e.target.value;
const questionIds = categoryMapping[currentCategory];
currentQuestionIndex = questionIds[0];
updateQuestionSelect(currentQuestionIndex);
displayQuestion(currentQuestionIndex);
});
// Update expand buttons whenever the model is changed
document.getElementById('model-select').addEventListener('change', () => {
displayAnswers(currentQuestionIndex);
document.querySelectorAll('.expandable-card').forEach(card => {
updateExpandButtonVisibility(card);
});
updateModelSelect();
});
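// Keep the category and question dropdowns in sync when navigating with the prev/next buttons.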
function switchQuestionAndCategory() {
document.getElementById('question-select').value = currentQuestionIndex;
    const old_category = document.getElementById('category-select').value;
    const new_category = questionMapping[currentQuestionIndex].category;
if (old_category != new_category) {
document.getElementById('category-select').value = new_category;
updateQuestionSelect(currentQuestionIndex);
}
displayQuestion(currentQuestionIndex);
}
document.getElementById('prev-question').addEventListener('click', () => {
// Question index starts from 1.
currentQuestionIndex = Math.max(1, currentQuestionIndex - 1);
switchQuestionAndCategory();
});
document.getElementById('next-question').addEventListener('click', () => {
// Question index starts from 1.
currentQuestionIndex = Math.min(questionsCount, currentQuestionIndex + 1);
switchQuestionAndCategory();
});
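// Show the "Show more" button only when the card text overflows its container.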
function updateExpandButtonVisibility(card) {
const cardTextContainer = card.querySelector('.card-text-container');
const expandBtn = card.querySelector('.expand-btn');
if (cardTextContainer.scrollHeight > cardTextContainer.offsetHeight) {
expandBtn.style.display = 'flex';
} else {
expandBtn.style.display = 'none';
card.classList.add('expanded');
}
}
document.querySelectorAll('.expand-btn').forEach(btn => {
btn.addEventListener('click', e => {
const card = e.target.closest('.expandable-card');
card.classList.toggle('expanded');
const more = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more';
const less = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_up</i> Show less';
e.target.innerHTML = card.classList.contains('expanded') ? less : more;
});
});