import torch
from PIL import Image
from abc import abstractproperty
import sys
import warnings
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class VILA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b',
**kwargs):
try:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN # noqa E501
from llava.conversation import conv_templates, SeparatorStyle
except Exception:
warnings.warn('Please install VILA before using VILA')
warnings.warn('Please install VILA from https://github.com/NVlabs/VILA')
warnings.warn('Please install VLMEvalKit after installing VILA')
warnings.warn('VILA is supported only with transformers==4.36.2')
sys.exit(-1)
warnings.warn('Please install the latest version of VILA from GitHub before you evaluate the VILA model.')
assert osp.exists(model_path) or len(model_path.split('/')) == 2
model_name = get_model_name_from_path(model_path)
try:
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=model_name,
device='cpu',
device_map='cpu'
)
except Exception as e:
warnings.warn(f'Error loading VILA model: {e}')
sys.exit(-1)
self.model = self.model.cuda()
if '3b' in model_path:
self.conv_mode = 'vicuna_v1'
elif '8b' in model_path:
self.conv_mode = 'llama_3'
elif '13b' in model_path:
self.conv_mode = 'vicuna_v1'
elif '40b' in model_path:
self.conv_mode = 'hermes-2'
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1, use_cache=True) # noqa E501
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Using the following kwargs for generation config: {self.kwargs}')
self.conv_templates = conv_templates
self.process_images = process_images
self.tokenizer_image_token = tokenizer_image_token
self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
self.SeparatorStyle = SeparatorStyle
self.IMAGE_TOKEN_INDEX = IMAGE_TOKEN_INDEX
self.KeywordsStoppingCriteria = KeywordsStoppingCriteria
def use_custom_prompt(self, dataset):
assert dataset is not None
# TODO see if custom prompt needed
return False
def generate_inner(self, message, dataset=None):
content, images = '', []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(image)
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
image_tensor = self.process_images(
images, self.image_processor,
self.model.config).to(self.model.device, dtype=torch.float16)
# Support interleave text and image
conv = self.conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = self.tokenizer_image_token(prompt, self.tokenizer, self.IMAGE_TOKEN_INDEX,
return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids, images=image_tensor, stopping_criteria=[stopping_criteria], **self.kwargs)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return output
import warnings
from .base import BaseModel
from ..smp import *
class VisualGLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
try:
import sat
except Exception:
warnings.warn('Please install SwissArmyTransformer to use VisualGLM')
assert model_path is not None
self.model_path = model_path
from transformers import AutoModel
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
self.model = model
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
output, _ = self.model.chat(
image_path=image_path,
tokenizer=self.tokenizer,
query=prompt,
history=[],
**self.kwargs
)
return output
import torch
import sys
import os.path as osp
import warnings
from .base import BaseModel
from transformers import StoppingCriteriaList
from omegaconf import OmegaConf
from PIL import Image
from huggingface_hub import snapshot_download
from vlmeval.smp import *
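# Config for XVERSE-V-13B: a CLIP ViT-L/14 vision encoder on top of the XVERSE-13B-Chat LLM.
# 'ckpt' initially names the HF repo and is later replaced with the local path of adapter_and_lora.bin.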
model_cfgs = {
'XVERSE-V-13B': {
'arch': 'vxverse',
'model_type': 'pretrain_xverse13b-chat',
'max_txt_len': 512,
'end_sym': '<|endoftext|>',
'low_resource': False,
'prompt_template': 'Human: {}\nAssistant: ',
'ckpt': 'xverse/XVERSE-V-13B',
'lora_r': 128,
'lora_alpha': 256,
'lora_dropout': 0.05,
'lora_target_modules': 'all_linear',
'has_qformer': False,
'n_proj_layers': 2,
'vit_model': 'openai/clip-vit-large-patch14',
'vit_path': 'openai/clip-vit-large-patch14',
'image_size': 224,
'drop_path_rate': 0,
'vit_precision': 'fp16',
'llama_model': 'xverse/XVERSE-13B-Chat',
}
}
class VXVERSE(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs):
if root is None:
warnings.warn('Please set root to the directory of vxverse.')
if model_name == 'XVERSE-V-13B':
cfg = model_cfgs['XVERSE-V-13B']
else:
raise NotImplementedError
ckpt_dir = cfg['ckpt']
if not osp.isdir(ckpt_dir):
cache_path = get_cache_path(ckpt_dir)
if cache_path is not None:
ckpt_dir = cache_path
else:
ckpt_dir = snapshot_download(repo_id=ckpt_dir)
assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir)
ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin')
cfg['ckpt'] = ckpt
model_cfg = OmegaConf.create(cfg)
self.model_name = model_name
self.root = root
sys.path.append(self.root)
from vxverse.common.registry import registry
from vxverse.conversation.conversation import CONV_VISION_XVERSE
device = torch.cuda.current_device()
self.device = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224))
vis_processor = registry.get_processor_class(
vis_processor_cfg.name
).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.vis_processor_cfg = vis_processor_cfg
self.CONV_VISION = CONV_VISION_XVERSE
self.CONV_VISION.system = ''
stop_words_ids = [[835], [2277, 29937]]
self.stop_words_ids = stop_words_ids
default_kwargs = dict(max_new_tokens=512)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
if self.vis_processor_cfg.name == 'hd_image_train':
patches_per_image = [[image.shape[0]]]
image = [image]
else:
patches_per_image = None
image = image.unsqueeze(0)
chat_state = self.CONV_VISION.copy()
texts = self.prepare_texts([prompt], chat_state)
texts = [text.lstrip() for text in texts]
answers = self.model.generate(
image,
texts,
patches_per_images=patches_per_image,
do_sample=False,
stop_words_ids=self.stop_words_ids,
**self.kwargs
)
return answers[0]
def prepare_texts(self, texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[
conv.append_message(conv.roles[0], '<ImageHere>\n{}'.format(text))
for conv, text in zip(convs, texts)
]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
import torch
from PIL import Image
import sys
from ..smp import *
from .base import BaseModel
from ..dataset import DATASET_TYPE
from transformers import AutoModel, GenerationConfig
class WeMM(BaseModel):
def __init__(self, model_path='feipengma/WeMM', **kwargs):
self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
self.wemm.cuda()
self.wemm.eval()
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'HallusionBench':
prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.'
gen_config = None
if dataset == 'MMVet':
gen_config = GenerationConfig(
max_new_tokens=512,
do_sample=True,
temperature=0.7,
num_beams=3,
eos_token_id=self.wemm.tokenizer.eos_token_id,
pad_token_id=self.wemm.tokenizer.pad_token_id
if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id,
)
pred = self.wemm.mm_generate(image_path, prompt, gen_config)
return pred
from .sharecaptioner import ShareCaptioner
from .xcomposer import XComposer
from .xcomposer2 import XComposer2
from .xcomposer2_4KHD import XComposer2_4KHD
from .xcomposer2d5 import XComposer2d5
__all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5']
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class ShareCaptioner(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs):
assert model_path is not None
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True).eval()
self.model.tokenizer = tokenizer
self.model.cuda()
self.model.half()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
seg1 = '<|User|>:'
seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:'
self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True)
self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False)
image = Image.open(image_path).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0)
image = image.to(self.model.device)
tmp_bs = image.shape[0]
tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1)
tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1)
with torch.cuda.amp.autocast():
with torch.no_grad():
image = self.model.encode_img(image)
input_emb = torch.cat(
[tmp_seg_emb1, image, tmp_seg_emb2], dim=1)
out_embeds = self.model.internlm_model.generate(
inputs_embeds=input_emb,
max_length=500,
num_beams=3,
min_length=1,
do_sample=True,
repetition_penalty=1.5,
length_penalty=1.0,
temperature=1.,
eos_token_id=self.model.tokenizer.eos_token_id,
num_return_sequences=1)
for j, out in enumerate(out_embeds):
out[out == -1] = 2
response = self.model.decode_text([out])
return response
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
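# Stops generation once the tail of the generated sequence matches any of the provided stop token sequences.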
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=None, encounters=1):
super().__init__()
self.stops = stops if stops is not None else []
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
class XComposer(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.internlm_model.model.embed_tokens.weight.device
self.eoh = '<TOKENS_UNUSED_0>'
self.eoa = '<TOKENS_UNUSED_1>'
stop_words_ids = [
torch.tensor([103027]).to(self.device), # end of human
torch.tensor([103028]).to(self.device), # end of bot
]
default_kwargs = {
'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False,
'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0
}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
if len(message) == 2:
if message[0]['type'] == 'text' and message[1]['type'] == 'image':
message = [message[1], message[0]]
kwargs = cp.deepcopy(self.kwargs)
if dataset is not None:
if DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 5
kwargs['num_beams'] = 5
with torch.cuda.amp.autocast():
with torch.no_grad():
prompt_embs = self.message_to_prompt_embs(message, dataset)
outputs = self.model.internlm_model.generate(
inputs_embeds=prompt_embs,
stopping_criteria=self.stopping_criteria,
**kwargs
)
output_token = outputs[0]
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split(self.model.eoa)[0]
output_text = output_text.split('<|Bot|>')[-1].strip()
return output_text
def message_to_prompt_embs(self, message, dataset=None):
assert isinstance(message, list)
img_embeds = []
prompt_full = '<|User|>: '
for msg in message:
if msg['type'] == 'text':
prompt_full += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
img_embeds.append(self.model.encode_img(image))
prompt_full += '<ImageHere>'
prompt_full += self.model.eoh + ' <|Bot|>: '
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt_full += 'Answer: The answer is '
elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']:
prompt_full += 'Answer: '
prompt_segs = prompt_full.split('<ImageHere>')
assert len(prompt_segs) == len(img_embeds) + 1
prompt_seg_tokens = [
self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long()
for i, seg in enumerate(prompt_segs)
]
prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens]
all_embeddings = []
for i in range(len(img_embeds)):
all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]])
all_embeddings.append(prompt_seg_embs[-1])
prompt_embs = torch.cat(all_embeddings, dim=1)
return prompt_embs
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Context: ' + context + '\nQuestion: ' + question
if len(options_prompt):
mid_prompt += '\nOptions: ' + options_prompt
if len(options):
txt_prompt = 'Please answer this question by choosing the correct choice.'
else:
txt_prompt = 'Please answer this question directly. '
prompt = txt_prompt + mid_prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
import torchvision
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import re
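# Regex used to pull the predicted option letter (A-Z) out of the raw multiple-choice answer.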
pattern = re.compile(r'[A-Z]')
def __padding__(image):
width, height = image.size
tar = max(width, height)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = int((tar - width) / 2)
right_padding = tar - width - left_padding
image = torchvision.transforms.functional.pad(image, [left_padding, top_padding, right_padding, bottom_padding])
return image
meta_instruction = """
You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by
Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language
chosen by the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively
based on the provided image.
"""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except Exception:
image = images[i].convert('RGB')
if padding:
image = __padding__(image)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(
inputs_embeds=embeds,
im_mask=im_mask,
temperature=1.0,
max_new_tokens=max_token,
num_beams=beams,
do_sample=False,
repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True)
return out
def generate_vanilla(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate_directly(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{q}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import numpy as np
import torchvision.transforms as transforms
import re
pattern = re.compile(r'[A-Z]')
def padding_336(b):
width, height = b.size
tar = int(np.ceil(height / 336) * 336)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def HD_transform(img, im_num=16):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
new_w = int(scale * 336)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_336(img)
width, height = img.size
assert width * height <= im_num * 336 * 336
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed\
by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by\
the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses\
effectively based on the provided image."""
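# Same interleaved text/image generation helper as in XComposer2 above, except each image is first
# tiled with HD_transform using the model's configurable hd_num budget.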
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except Exception:
image = images[i].convert('RGB')
image = HD_transform(image, im_num=model.hd_num)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2_4KHD(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-4khd-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 25
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
self.model.hd_num = 25
elif listinstr(['docvqa_test', 'infovqa_test'], dataset.lower()):
self.model.hd_num = 65
elif listinstr(['docvqa_val', 'infovqa_val', 'ocrbench'], dataset.lower()):
self.model.hd_num = 55
elif listinstr(['mmlongbench_doc'], dataset.lower()):
self.model.hd_num = 45
elif listinstr(['mmmu', 'mmbench', 'mmvet'], dataset.lower()):
self.model.hd_num = 16
else:
self.model.hd_num = 25
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, q)
elif listinstr(['mmlongbench_doc'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.'
f'{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import re
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from ...dataset import DATASET_TYPE
from ...smp import *
from ..base import BaseModel
pattern = re.compile(r'[A-Z]')
def padding_560(b):
width, height = b.size
tar = int(np.ceil(height / 560) * 560)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(
b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
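# 560-pixel variant of HD_transform used by XComposer2d5; id_scale additionally caps the scale factor
# relative to the original image width.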
def HD_transform(img, im_num=36, id_scale=1.5):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
scale = min(np.ceil(width * id_scale / 560), scale)
new_w = int(scale * 560)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_560(img)
width, height = img.size
assert width * height <= im_num * 560 * 560
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
meta_instruction = """You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) \
is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).
It is designed to be helpful, honest, and harmless.\n"+"- InternLM (书生·浦语) \
can understand and communicate fluently in the language chosen by the user such as English and 中文."""
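# XComposer2d5 generation helper: the prompt is split on '<IM_POS>' placeholders so multiple images can
# be inserted at explicit positions; when several images are present, each one gets a proportional share
# of the hd_num tile budget.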
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
embeds = []
im_mask = []
im_idx = 0
sub_q = text.split('<IM_POS>')
add_im = len(sub_q) - 1
for subtext in sub_q:
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(
subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).to(model.device))
need_bos = False
if im_idx < len(images) and add_im:
try:
image = Image.open(images[im_idx]).convert('RGB')
except Exception:
image = images[im_idx].convert('RGB')
if len(images) > 1:
image = HD_transform(image, im_num=model.hd_num // len(images), id_scale=model.id_scale)
else:
image = HD_transform(
image, im_num=model.hd_num, id_scale=model.id_scale)
image = model.vis_processor(image).unsqueeze(0).to(model.device)
image_embeds = model.encode_img(image)
im_idx += 1
add_im -= 1
embeds.append(image_embeds)
im_mask.append(torch.ones(
image_embeds.shape[:2], dtype=torch.long).to(model.device))
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(
output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2d5(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='internlm/internlm-xcomposer2d5-7b', id_scale=1.5, beam=3, **kwargs):
assert model_path is not None
self.model_path = model_path
self.id_scale = id_scale
self.beam = beam
model = AutoModel.from_pretrained(
self.model_path, device_map='cpu', trust_remote_code=True, local_files_only=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 36
self.model.id_scale = self.id_scale
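# Overrides BaseModel.message_to_promptimg to keep every image path; the text is rewritten around
# '<IM_POS>' placeholders (one per image for the multi-image document datasets, a single leading one
# otherwise) that model_gen later fills with image embeddings.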
def message_to_promptimg(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value']
for x in message if x['type'] == 'text'])
image = None
else:
image = [x['value'] for x in message if x['type'] == 'image']
if len(image) == 1:
prompt = ''.join([x['value']
for x in message if x['type'] == 'text'])
im_prompt = '<IM_POS>'
prompt = prompt.replace('<image 1>', '')
prompt = im_prompt + prompt
else:
prompt = ''
im_prompt = [
f'Image{im_idx+1}: <IM_POS>;' for im_idx in range(len(image))]
add_im = len(im_prompt)
im_idx = 0
for x in message:
if x['type'] == 'text':
prompt += x['value']
if add_im > im_idx:
prompt += f'Image{im_idx + 1}'
im_idx += 1
im_prompt = ' '.join(im_prompt)
for i in range(len(image)):
prompt = prompt.replace(f'<image {i+1}>', f'Image{i+1} ')
if dataset is not None and listinstr(['mmlongbench', 'dude', 'slidevqa'], dataset.lower()):  # fix bug for multi-image prompt
prompt = '[UNUSED_TOKEN_146]user\n' + im_prompt + re.sub(
re.escape('[UNUSED_TOKEN_146]user\n'), '', prompt
)
prompt = re.sub('Image1$', '', prompt)
return prompt, image
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=self.beam)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path,
need_bos=True, padding=False, beams=self.beam, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=10)
return out
def set_max_num(self, dataset):
if dataset is not None and listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
self.model.hd_num = 25
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu', 'pope'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if (
'hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + \
context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, q)
elif listinstr(['mmlongbench_doc', 'dude', 'slidevqa'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.'
f'{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class XGenMM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs):
try:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
except Exception:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForVision2Seq.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
).eval()
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False, legacy=False
)
tokenizer = model.update_special_tokens(tokenizer)
tokenizer.eos_token = '<|end|>'
tokenizer.padding_side = 'left'
image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.image_processor = image_processor
self.tokenizer = tokenizer
self.kwargs = kwargs
def apply_prompt_template(self, query):
s = (
'<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
"The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
f'<|user|>\n{query}<|end|>\n<|assistant|>\n'
)
return s
def generate_inner(self, message, dataset=None):
content, images, image_sizes = '', [], []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda'))
image_sizes.append(image.size)
content += '<image> '
inputs = {'pixel_values': [images]}
prompt = self.apply_prompt_template(content)
language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda')
inputs.update(language_inputs)
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.0,
'do_sample': False,
'top_p': None,
'num_beams': 1
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs, image_size=[image_sizes],
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**generation_args
)
# remove input tokens
response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0]
return response
import torch
import sys
import os.path as osp
import warnings
from PIL import Image
from vlmeval.smp import get_cache_path, load, dump, splitlen
from huggingface_hub import snapshot_download
from .base import BaseModel
"""
You can perform inference of Yi-VL through the following steps:
1. clone the repo https://github.com/01-ai/Yi to path-to-Yi
2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt
3. set Yi_ROOT in vlmeval/config.py
Yi_ROOT = path-to-Yi
You are all set now! To run a demo for Yi-VL:
```python
from vlmeval import *
model = supported_VLM['Yi_VL_6B']()
model.generate('apple.jpg', 'What is in this image?')
```
To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}`
"""
def edit_config(repo_id):
if not osp.exists(repo_id):
root = get_cache_path(repo_id)
else:
root = repo_id
assert root is not None and osp.exists(root)
cfg = osp.join(root, 'config.json')
data = load(cfg)
mm_vision_tower = data['mm_vision_tower']
if mm_vision_tower.startswith('./vit/'):
data['mm_vision_tower'] = osp.join(root, mm_vision_tower)
assert osp.exists(data['mm_vision_tower'])
dump(data, cfg)
def disable_torch_init():
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
import torch
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
class Yi_VL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
model_path='01-ai/Yi-VL-6B',
root=None,
**kwargs):
if root is None:
warnings.warn(
'Please set root to the directory of Yi, '
'which is cloned from here: https://github.com/01-ai/Yi.'
)
self.root = osp.join(root, 'VL')
sys.path.append(self.root)
if splitlen(model_path, '/') == 2 and not osp.exists(model_path):
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
edit_config(model_path)
elif osp.exists(model_path):
edit_config(model_path)
from llava.mm_utils import get_model_name_from_path, load_pretrained_model
from llava.model.constants import key_info
disable_torch_init()
key_info['model_path'] = model_path
get_model_name_from_path(model_path)
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path,
device_map='cpu')
self.model = self.model.cuda()
self.conv_mode = 'mm_default'
kwargs_default = dict(temperature=0.2,
num_beams=1,
do_sample=False,
max_new_tokens=1024,
top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
from llava.conversation import conv_templates
from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token
qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = (
tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
.unsqueeze(0)
.cuda()
)
image = Image.open(image_path)
if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad':
if image.mode == 'L':
background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3)
else:
background_color = tuple(int(x * 255) for x in self.image_processor.image_mean)
image = expand2square(image, background_color)
image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
'pixel_values'
][0]
stop_str = conv.sep
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
self.model = self.model.to(dtype=torch.bfloat16)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
stopping_criteria=[stopping_criteria],
use_cache=True,
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(
f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids'
)
outputs = self.tokenizer.batch_decode(
output_ids[:, input_token_len:], skip_special_tokens=True
)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
Gemma Terms of Use
Last modified: April 1, 2024
By using, reproducing, modifying, distributing, performing or displaying any portion or element of Gemma, Model Derivatives including via any Hosted Service, (each as defined below) (collectively, the "Gemma Services") or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
Section 1: DEFINITIONS
1.1 Definitions
(a) "Agreement" or "Gemma Terms of Use" means these terms and conditions that govern the use, reproduction, Distribution or modification of the Gemma Services and any terms and conditions incorporated by reference.
(b) "Distribution" or "Distribute" means any transmission, publication, or other sharing of Gemma or Model Derivatives to a third party, including by providing or making Gemma or its functionality available as a hosted service via API, web access, or any other electronic or remote means ("Hosted Service").
(c) "Gemma" means the set of machine learning language models, trained model weights and parameters identified at ai.google.dev/gemma, regardless of the source that you obtained it from.
(d) "Google" means Google LLC.
(e) "Model Derivatives" means all (i) modifications to Gemma, (ii) works based on Gemma, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Gemma, to that model in order to cause that model to perform similarly to Gemma, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by Gemma for training that model. For clarity, Outputs are not deemed Model Derivatives.
(f) "Output" means the information content output of Gemma or a Model Derivative that results from operating or otherwise using Gemma or the Model Derivative, including via a Hosted Service.
1.2
As used in this Agreement, "including" means "including without limitation".
Section 2: ELIGIBILITY AND USAGE
2.1 Eligibility
You represent and warrant that you have the legal capacity to enter into this Agreement (including being of sufficient age of consent). If you are accessing or using any of the Gemma Services for or on behalf of a legal entity, (a) you are entering into this Agreement on behalf of yourself and that legal entity, (b) you represent and warrant that you have the authority to act on behalf of and bind that entity to this Agreement and (c) references to "you" or "your" in the remainder of this Agreement refers to both you (as an individual) and that entity.
2.2 Use
You may use, reproduce, modify, Distribute, perform or display any of the Gemma Services only in accordance with the terms of this Agreement, and must not violate (or encourage or permit anyone else to violate) any term of this Agreement.
Section 3: DISTRIBUTION AND RESTRICTIONS
3.1 Distribution and Redistribution
You may reproduce or Distribute copies of Gemma or Model Derivatives if you meet all of the following conditions:
You must include the use restrictions referenced in Section 3.2 as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Gemma or Model Derivatives and you must provide notice to subsequent users you Distribute to that Gemma or Model Derivatives are subject to the use restrictions in Section 3.2.
You must provide all third party recipients of Gemma or Model Derivatives a copy of this Agreement.
You must cause any modified files to carry prominent notices stating that you modified the files.
All Distributions (other than through a Hosted Service) must be accompanied by a "Notice" text file that contains the following notice: "Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms".
You may add your own intellectual property statement to your modifications and, except as set forth in this Section, may provide additional or different terms and conditions for use, reproduction, or Distribution of your modifications, or for any such Model Derivatives as a whole, provided your use, reproduction, modification, Distribution, performance, and display of Gemma otherwise complies with the terms and conditions of this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
3.2 Use Restrictions
You must not use any of the Gemma Services:
for the restricted uses set forth in the Gemma Prohibited Use Policy at ai.google.dev/gemma/prohibited_use_policy ("Prohibited Use Policy"), which is hereby incorporated by reference into this Agreement; or
in violation of applicable laws and regulations.
To the maximum extent permitted by law, Google reserves the right to restrict (remotely or otherwise) usage of any of the Gemma Services that Google reasonably believes are in violation of this Agreement.
3.3 Generated Output
Google claims no rights in Outputs you generate using Gemma. You and your users are solely responsible for Outputs and their subsequent uses.
Section 4: ADDITIONAL PROVISIONS
4.1 Updates
Google may update Gemma from time to time.
4.2 Trademarks
Nothing in this Agreement grants you any rights to use Google's trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between you and Google. Google reserves any rights not expressly granted herein.
4.3 DISCLAIMER OF WARRANTY
UNLESS REQUIRED BY APPLICABLE LAW, THE GEMMA SERVICES, AND OUTPUTS, ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE GEMMA SERVICES OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR USE OR DISTRIBUTION OF ANY OF THE GEMMA SERVICES OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
4.4 LIMITATION OF LIABILITY
TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY, CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW, SHALL GOOGLE OR ITS AFFILIATES BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO, ANY OF THE GEMMA SERVICES OR OUTPUTS EVEN IF GOOGLE OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
4.5 Term, Termination, and Survival
The term of this Agreement will commence upon your acceptance of this Agreement (including acceptance by your use, modification, or Distribution, reproduction, performance or display of any portion or element of the Gemma Services) and will continue in full force and effect until terminated in accordance with the terms of this Agreement. Google may terminate this Agreement if you are in breach of any term of this Agreement. Upon termination of this Agreement, you must delete and cease use and Distribution of all copies of Gemma and Model Derivatives in your possession or control. Sections 1, 2.1, 3.3, 4.2 to 4.9 shall survive the termination of this Agreement.
4.6 Governing Law and Jurisdiction
This Agreement will be governed by the laws of the State of California without regard to choice of law principles. The UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The state and federal courts of Santa Clara County, California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
4.7 Severability
If any provision of this Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
4.8 Entire Agreement
This Agreement states all the terms agreed between the parties and supersedes all other agreements between the parties as of the date of acceptance relating to its subject matter.
4.9 No Waiver
Google will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
META LLAMA 3 COMMUNITY LICENSE AGREEMENT
Meta Llama 3 Version Release Date: April 18, 2024
“Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
“Documentation” means the specifications, manuals and documentation accompanying Meta Llama 3 distributed by Meta at https://llama.meta.com/get-started/.
“Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
“Meta Llama 3” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
“Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation (and any portion thereof) made available under this Agreement.
“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
1. License Rights and Redistribution.
a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
b. Redistribution and Use.
i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service that uses any of them, including another AI model, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Meta Llama 3” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama 3” at the beginning of any such AI model name.
ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which is hereby incorporated by reference into this Agreement.
v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Meta Llama 3 or derivative works thereof).
2. Additional Commercial Terms. If, on the Meta Llama 3 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
5. Intellectual Property.
a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/ ). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
Meta Llama 3 Acceptable Use Policy
Meta is committed to promoting safe and fair use of its tools and features, including Meta Llama 3. If you access or use Meta Llama 3, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at https://llama.meta.com/llama3/use-policy
Prohibited Uses
We want everyone to use Meta Llama 3 safely and responsibly. You agree you will not use, or allow others to use, Meta Llama 3 to:
1. Violate the law or others’ rights, including to:
a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
i. Violence or terrorism
ii. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
iii. Human trafficking, exploitation, and sexual violence
iv. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
v. Sexual solicitation
vi. Any other criminal activity
b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
d. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
f. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama Materials
g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Meta Llama 3 related to the following:
a. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
b. Guns and illegal weapons (including weapon development)
c. Illegal drugs and regulated/controlled substances
d. Operation of critical infrastructure, transportation technologies, or heavy machinery
e. Self-harm or harm to others, including suicide, cutting, and eating disorders
f. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
3. Intentionally deceive or mislead others, including use of Meta Llama 3 related to the following:
a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
b. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
c. Generating, promoting, or further distributing spam
d. Impersonating another individual without consent, authorization, or legal right
e. Representing that the use of Meta Llama 3 or outputs are human-generated
f. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
g. Fail to appropriately disclose to end users any known dangers of your AI system
Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means:
* Reporting issues with the model: https://github.com/meta-llama/llama3
* Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback
* Reporting bugs and security concerns: facebook.com/whitehat/info
* Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3: LlamaUseReport@meta.com
Tongyi Qianwen LICENSE AGREEMENT
Tongyi Qianwen Release Date: August 3, 2023
By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. Definitions
a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.
b. "We"(or "Us") shall mean Alibaba Cloud.
c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.
d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You.
e. "Tongyi Qianwen" shall mean the large language models (including Qwen model and Qwen-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us.
f. "Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement.
g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.
h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation,
and conversions to other media types.
2. Grant of Rights
You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials.
3. Redistribution
You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;
b. You shall cause any modified files to carry prominent notices stating that You changed the files;
c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and
d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.
4. Restrictions
If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization.
5. Rules of use
a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.
b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof).
6. Intellectual Property
a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.
b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.
c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.
7. Disclaimer of Warranty and Limitation of Liability
a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto.
b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.
c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.
d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.
8. Survival and Termination.
a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement.
9. Governing Law and Jurisdiction.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.
icon.png (53.8 KB)
import os

# Disable HuggingFace tokenizers parallelism to silence fork-related warnings
# when the evaluation code spawns worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"