Commit c873301f authored by wanglch

Initial commit

{
  "crop_size": 224,
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": 224
}
{"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
ENV DEBIAN_FRONTEND=noninteractive
# note: environment variables sourced in one RUN layer do not carry over to later layers
RUN /bin/bash -c "source /opt/dtk-23.10/env.sh"
WORKDIR /vary-toy
# assumes the build context is the Vary-toy repository root
COPY . .
RUN pip install -e .
RUN pip install ninja
# Unique model identifier
modelCode=645
# Model name
modelName=Vary-toy_pytorch
# Model description
modelDescription=Multimodal OCR model
# Application scenarios
appScenario=Inference,Finance,Education,Government,Research,Transportation,Healthcare
# Framework type
frameType=pytorch
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "vary"
version = "0.1.0"
description = "Towards GPT-4 like large language and visual assistant."
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
]
dependencies = [
"einops", "markdown2[all]", "numpy",
"requests", "sentencepiece", "tokenizers>=0.12.1",
"torch", "torchvision", "wandb",
"shortuuid", "httpx==0.24.0",
"deepspeed==0.12.3",
"peft==0.4.0",
"albumentations ",
"opencv-python",
"tiktoken",
"accelerate==0.24.1",
"transformers==4.32.1",
"bitsandbytes==0.41.0",
"scikit-learn==1.2.2",
"sentencepiece==0.1.99",
"einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
"gradio_client==0.2.9"
]
[tool.setuptools.packages.find]
exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
[tool.wheel]
exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
home = /usr/bin
implementation = CPython
version_info = 3.8.10.final.0
virtualenv = 20.16.7
include-system-site-packages = true
base-prefix = /usr
base-exec-prefix = /usr
base-executable = /usr/bin/python3
#!/bin/bash
# Replace this with your own program script #
python /home/wanglch/projects/Vary-toy/vary/demo/run_qwen_vary.py --model-name /home/wanglch/projects/Vary-toy/cache/models--HaoranWei--Vary-toy --image-file /home/wanglch/projects/Vary-toy/image/pic.jpg
import torch
import transformers
from dataclasses import dataclass, field
from vary.utils.constants import *

@dataclass
class DataCollatorForSupervisedDataset(object):
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances):
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        images = [torch.stack(instance['image']) for instance in instances]
        images_high = [torch.stack(instance['image_high']) for instance in instances]
        images = list(zip(images, images_high))

        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            labels,
            batch_first=True,
            padding_value=IGNORE_INDEX)

        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
            images=images,
        )
        return batch


def make_supervised_data_module(interleave, with_box, tokenizer, data_args):
    if data_args.conversation_version == 'mpt':
        from vary.data.conversation_dataset_qwen import ConversationDataset
        dataset_cls = ConversationDataset
    elif data_args.conversation_version == 'opt':
        from vary.data.caption_opt import CaptionDataset
        dataset_cls = CaptionDataset

    train_dataset = dataset_cls(
        tokenizer=tokenizer,
        datasets=data_args.datasets,
        multimodal_cfg=dict(
            sep_image_conv_front=data_args.sep_image_conv_front,
            image_token_len=data_args.image_token_len,
            image_aspect_ratio=data_args.image_aspect_ratio,
            use_im_start_end=data_args.use_im_start_end,
            image_processor=data_args.image_processor,
            image_processor_high=data_args.image_processor_high,
            box_limit=data_args.box_limit,
        )
    )
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset,
                eval_dataset=None,
                data_collator=data_collator)
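
For reference, a minimal sketch (not part of the committed file) of how the data module above is typically wired into a Hugging Face Trainer. The DataArguments fields mirror exactly the attributes read by make_supervised_data_module; the default values and the vary.data import path are assumptions.

from dataclasses import dataclass, field

import transformers

from vary.data import make_supervised_data_module  # assumed location of the module above


@dataclass
class DataArguments:
    # every field below corresponds to a data_args attribute read by make_supervised_data_module
    datasets: str = field(default=None)
    conversation_version: str = field(default='opt')
    sep_image_conv_front: bool = False
    image_token_len: int = 256           # assumed value; set to match the vision tower
    image_aspect_ratio: str = 'square'
    use_im_start_end: bool = True
    box_limit: int = 0
    image_processor: object = None       # CLIP-style processor, filled in at runtime
    image_processor_high: object = None  # high-resolution (SAM-style) processor


# with tokenizer, model, training_args and the two processors already built:
# data_module = make_supervised_data_module(interleave=False, with_box=False,
#                                           tokenizer=tokenizer, data_args=data_args)
# trainer = transformers.Trainer(model=model, tokenizer=tokenizer,
#                                args=training_args, **data_module)
# trainer.train()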
import io
import os
import copy
import json
import logging
import torch
import transformers
from typing import List, Optional, Tuple, Union, Dict, Sequence
from torch.utils.data import Dataset
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from vary.utils.constants import *

class BaseDataset(Dataset):
    def __init__(
        self,
        datasets: str,
        tokenizer: transformers.PreTrainedTokenizer,
        multimodal_cfg: dict
    ):
        super(BaseDataset, self).__init__()
        self.tokenizer = tokenizer
        self.multimodal_cfg = multimodal_cfg
        logging.warning(f"Using {multimodal_cfg['image_token_len']} tokens to represent the image")

    def image_processor(self, image):
        processor = self.multimodal_cfg['image_processor']  # the first processor, usually the CLIP-pretrained ViT
        processor_high = self.multimodal_cfg['image_processor_high']  # the second processor, usually the dedicated image encoder (SAM/Swin/CNN)
        image_high = image.copy()

        # TODO: the 'keep' and 'pad' modes only apply to the first processor
        if self.multimodal_cfg['image_aspect_ratio'] == 'keep':
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            max_len, min_len = 448, 224
            shortest_edge = int(min(max_len / aspect_ratio, min_len))
            image = processor.preprocess(image, return_tensors='pt', do_center_crop=False,
                                         size={"shortest_edge": shortest_edge})['pixel_values'][0]
        elif self.multimodal_cfg['image_aspect_ratio'] == 'pad':
            def expand2square(pil_img, background_color):
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img)  # paste at the origin for simpler box processing
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img)  # paste at the origin for simpler box processing
                    return result
            image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
            image = processor.preprocess(image, return_tensors='pt', do_center_crop=False,
                                         size={"shortest_edge": 224})['pixel_values'][0]
        else:
            image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

        image_high = processor_high(image_high)
        return image, image_high

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        pass
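
To make the 'keep' branch above concrete, a small stand-alone helper (purely illustrative, not part of the file) reproduces its resize arithmetic:

def keep_branch_shortest_edge(width: int, height: int,
                              max_len: int = 448, min_len: int = 224) -> int:
    """Reproduce the 'keep' branch arithmetic from BaseDataset.image_processor."""
    max_hw, min_hw = max(width, height), min(width, height)
    aspect_ratio = max_hw / min_hw
    return int(min(max_len / aspect_ratio, min_len))


# a 2:1 image keeps the full 224-pixel short side
assert keep_branch_shortest_edge(900, 450) == 224
# a 3:1 image gets a smaller short side so the long side stays within ~448 pixels
assert keep_branch_shortest_edge(900, 300) == 149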
import io
import os
import copy
import json
import logging
import torch
import random
from typing import List, Optional, Tuple, Union, Dict, Sequence
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from vary.data.base_dataset import BaseDataset
from vary.utils.constants import *
from vary.utils import conversation as conversation_lib

class CaptionDataset(BaseDataset):
    """Conversation-format dataset for stage-2 fine-tuning."""

    def __init__(self, datasets, tokenizer, multimodal_cfg):
        super(CaptionDataset, self).__init__(datasets, tokenizer, multimodal_cfg)
        # v0-style conversation format
        conversation_lib.default_conversation = conversation_lib.conv_templates["default"]
        logging.warning("Formatting inputs into conversation type: v0-fixed")
        logging.warning("Loading data...")

        list_data_dict = []
        list_image_path = []

        for name in datasets.split("+"):
            dataset = CONVERSATION_DATA[name]  # defined in vary.utils

            data_path = dataset['annotations']
            data = json.load(open(data_path, "r"))
            list_data_dict.extend(data)

            image_path = dataset['images']
            list_image_path.extend([image_path] * len(data))

            logging.warning(f"Data from {data_path} provides {len(data)} conversations.")

        assert len(list_data_dict) == len(list_image_path)
        logging.warning(f"{len(list_data_dict)} conversations in total.")

        a_new_list = list(zip(list_data_dict, list_image_path))
        random.shuffle(a_new_list)
        list_data_dict_new, list_image_path_new = zip(*a_new_list)
        self.list_data_dict = list_data_dict_new
        self.list_image_path = list_image_path_new

        self.im_patch_token, self.im_start_token, self.im_end_token = tokenizer.convert_tokens_to_ids(
            [DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])

    def multimodal_processor(self, sources):
        for source in sources:
            source[0]['value'] = DEFAULT_IMAGE_TOKEN
            for sentence in source:
                replace_token = DEFAULT_IMAGE_PATCH_TOKEN * self.multimodal_cfg['image_token_len']
                if self.multimodal_cfg['use_im_start_end']:
                    replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
                sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
        return sources

    def _tokenize_fn(self, strings):
        """Tokenize a list of strings."""
        tokenized_list = [
            self.tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
            ) for text in strings
        ]
        input_ids = labels = [
            tokenized.input_ids[0] for tokenized in tokenized_list
        ]
        # make sure every (possibly truncated) sequence still ends with the eos id (2)
        for idx, ii in enumerate(input_ids):
            if ii[-1] != 2:
                input_ids[idx][-1] = 2
                labels[idx][-1] = 2
        input_ids_lens = labels_lens = [
            tokenized.input_ids.ne(self.tokenizer.pad_token_id).sum().item()
            for tokenized in tokenized_list
        ]
        return dict(
            input_ids=input_ids,
            labels=labels,
            input_ids_lens=input_ids_lens,
            labels_lens=labels_lens,
        )

    def _mask_targets(self, target, tokenized_lens, speakers):
        # cur_idx = 0
        cur_idx = tokenized_lens[0]
        tokenized_lens = tokenized_lens[1:]
        target[:cur_idx] = IGNORE_INDEX
        for tokenized_len, speaker in zip(tokenized_lens, speakers):
            if speaker.lower() == "human":
                target[cur_idx:tokenized_len] = IGNORE_INDEX
            cur_idx += tokenized_len

    def _add_speaker_and_signal(self, header, source, get_conversation=True):
        """Add the speaker and start/end signals to each round."""
        BEGIN_SIGNAL = "</s>"
        END_SIGNAL = "\n"
        conversation = header
        for sentence in source:
            from_str = sentence["from"]
            if from_str.lower() == "human":
                from_str = conversation_lib.default_conversation.roles[0]
            else:
                from_str = conversation_lib.default_conversation.roles[1]
            sentence["value"] = sentence["value"] + END_SIGNAL
            if get_conversation:
                conversation += sentence["value"]
        conversation += BEGIN_SIGNAL
        return conversation

    def token_processor(self, sources):
        """
        Given a list of sources, each a conversation list, this transform:
        1. Appends the end signal '\n' to each sentence;
        2. Concatenates the conversations together;
        3. Tokenizes the concatenated conversation;
        4. Makes a deepcopy as the target and masks the human turns with IGNORE_INDEX.
        """
        # add the end signal and concatenate together
        conversations = []
        header = ''
        for source in sources:
            conversation = self._add_speaker_and_signal(header, source)
            conversations.append(conversation)

        conversations_tokenized = self._tokenize_fn(conversations)
        input_ids = conversations_tokenized["input_ids"]
        targets = copy.deepcopy(input_ids)

        for target, source in zip(targets, sources):
            tokenized_lens = self._tokenize_fn([header] + [s["value"] for s in source])["input_ids_lens"]
            speakers = [sentence["from"] for sentence in source]
            self._mask_targets(target, tokenized_lens, speakers)

        return dict(input_ids=input_ids, labels=targets)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # data = self.list_data_dict[i]
        data = copy.deepcopy(self.list_data_dict[i])

        if isinstance(data, dict):
            if 'image' in data:
                image_path = self.list_image_path[i]
                image_file = data['image']

                # TODO: this is a workaround, because some json entries carry a wrong path
                try:
                    image = Image.open(image_path + image_file).convert('RGB')
                except:
                    print(f'cannot identify image file {image_path + image_file}.')
                    return self.__getitem__(0)

                try:
                    image, image_high = self.image_processor(image)
                except:
                    print(f'image {image_file} is broken or grayscale! selecting the 0-th sample instead!')
                    return self.__getitem__(0)

            conversations = self.multimodal_processor([data["conversations"]])
        else:
            conversations = [data]

        # align with fastchat & llava here: put the conversation into a list for tokenization
        data_dict = self.token_processor(conversations)
        data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        if isinstance(data, dict) and 'image' in data:
            data_dict['image'] = [image]
            data_dict['image_high'] = [image_high]
        else:
            crop_size = self.multimodal_cfg['image_processor'].crop_size
            data_dict['image'] = [torch.zeros(3, crop_size['height'], crop_size['width'])]
            # TODO: the high-resolution (SAM) branch expects 1024x1024
            data_dict['image_high'] = [torch.zeros(3, 1024, 1024)]
        return data_dict
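
A minimal sketch (not part of the committed file) of how CaptionDataset pairs with the DataCollatorForSupervisedDataset defined earlier; the import path, helper name, and batch size are assumptions, and the tokenizer, processors, and CONVERSATION_DATA entries must be supplied locally.

from torch.utils.data import DataLoader

# assumed import path for the collator defined in the data module shown earlier
from vary.data import DataCollatorForSupervisedDataset


def build_caption_loader(dataset, tokenizer, batch_size=2):
    """Pair a CaptionDataset with the supervised collator for a quick smoke test."""
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collator)


# Each batch from the loader contains:
#   batch['input_ids']           (batch, seq_len), right-padded with pad_token_id
#   batch['labels']              same shape, with human turns masked to IGNORE_INDEX
#   batch['images'][i][0].shape  (1, 3, 224, 224)    low-resolution CLIP branch for sample i
#   batch['images'][i][1].shape  (1, 3, 1024, 1024)  high-resolution (SAM) branch for sample i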