# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import string
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
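# Collate a list of per-example dicts into a padded mini-batch: token
# sequences are padded with pad_idx, image tensors are stacked, and
# "ntokens" counts non-pad target (or source) tokens for loss averaging.
# A hypothetical usage sketch (not part of this file):
#     batch = dataset.collater([dataset[i] for i in range(8)])
#     logits = model(**batch["net_input"])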
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
}
return batch
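# Image captioning. Training strips punctuation (self.transtab) and truncates
# the caption to max_tgt_length tokens; validation/test keeps all reference
# captions joined by "&&" so downstream scoring can compare against each one.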
class CaptionDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
max_tgt_length=30,
patch_image_size=224,
imagenet_default_mean_and_std=False,
scst=False
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.scst = scst
self.transtab = str.maketrans({key: None for key in string.punctuation})
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = " what does the image describe?"
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "图片描述了什么内容?"
def __getitem__(self, index):
uniq_id, image, caption = self.dataset[index]
        try:
            image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
        except Exception:
            # urlsafe_b64decode fails on strings whose length is not a
            # multiple of 4; dump the offending string for inspection,
            # re-pad with "=" and retry.
            with open("image.txt", "w") as f:
                f.write(image)
            if len(image) % 4 != 0:
                image += "=" * (4 - len(image) % 4)
            print('***** corrupt image *****', index, uniq_id, image[-20:])
            image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
if self.split == 'train' and not self.scst:
caption = caption.translate(self.transtab).strip()
caption_token_list = caption.strip().split()
tgt_caption = ' '.join(caption_token_list[:self.max_tgt_length])
else:
caption = ' '.join(caption.strip().split())
caption_list = [cap.translate(self.transtab).strip() for cap in caption.strip().split('&&')]
tgt_caption = '&&'.join(caption_list)
src_item = self.encode_text(self.prompt)
tgt_item = self.encode_text(" {}".format(tgt_caption))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import base64
import random
import numpy as np
import torch
from PIL import Image, ImageFile
from itertools import chain
from data.ofa_dataset import OFADataset
from data import data_utils
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
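# Collate for text-to-image generation. Targets are sequences of discrete
# image codes; the raw base64 image string is passed through unchanged in
# "code_images" (a numpy object array) for use outside the model, e.g.
# evaluation-time reconstruction or scoring.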
def collate(
samples,
pad_idx,
eos_idx,
left_pad_source=False,
left_pad_target=False,
):
if len(samples) == 0:
return {}
def merge(key, left_pad, move_eos_to_beginning=False):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx,
left_pad,
move_eos_to_beginning,
)
id = np.array([s["id"] for s in samples])
    src_tokens = merge("source", left_pad=left_pad_source)
    src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
code_images = np.array([s["code_image"] for s in samples])
code_masks = torch.cat([sample['code_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target", left_pad=left_pad_target)
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target)
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"code_masks": code_masks,
"prev_output_tokens": prev_output_tokens
},
"code_images": code_images,
"target": target
}
return batch
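# VQGAN-style tokenizers expect pixel values in [-1, 1]; this maps a tensor
# from [0, 1] into that range.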
def preprocess_vqgan(x):
x = 2. * x - 1.
return x
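# Text-to-image generation. Raw VQ code indices are shifted into the region of
# the source dictionary reserved for image-code tokens; a blank placeholder
# image is base64-encoded once per dataset slice for rows without an image.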
class ImageGenDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
code_dict_size=8192,
code_image_size=256,
num_bins=1000
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.code_dict_size = code_dict_size
self.num_codes = (code_image_size // 8) ** 2
self.num_bins = num_bins
slice_id = self.dataset.slice_id
empty_img = Image.new('RGB', (code_image_size, code_image_size))
empty_img.save(f'temp_{slice_id}.png')
img = Image.open(f'temp_{slice_id}.png')
img_buffer = BytesIO()
img.save(img_buffer, format=img.format)
byte_data = img_buffer.getvalue()
self.empty_image_base64 = (base64.urlsafe_b64encode(byte_data)).decode("utf-8")
def __getitem__(self, index):
data = self.dataset[index]
if len(data) == 2:
uniq_id, text = data
image_code = [0] * 1024
image = self.empty_image_base64
elif len(data) == 3:
uniq_id, text, image_code = data
image_code = [int(num) for num in image_code.strip().split()]
image = self.empty_image_base64
elif len(data) == 4:
uniq_id, image, text, image_code = data
image_code = [int(num) for num in image_code.strip().split()]
else:
raise NotImplementedError
code_mask = torch.tensor([True])
image_code = torch.LongTensor(image_code)
tgt_item = image_code + len(self.src_dict) - self.code_dict_size - self.num_bins
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
caption_token_list = text.strip().split()
caption = ' '.join(caption_token_list[:self.max_src_length])
src_item = self.encode_text(
" what is the complete image? caption: {}".format(caption),
append_bos=True,
append_eos=True
)
example = {
"id": uniq_id,
"source": src_item,
"code_mask": code_mask,
"code_image": image,
"target": target_item,
"prev_output_tokens": prev_output_item
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import random
import functools
import numpy as np
import torch
import base64
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms import functional as F
from PIL import Image, ImageFile
from zhconv import convert
import unicodedata
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
}
return batch
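# Resize keeping aspect ratio, then pad to a square patch_image_size canvas at
# a random offset using "edge" padding. In document mode the image is first
# resized to a 64x1920 strip, chunked into 4 pieces along the width, and the
# pieces are stacked vertically before padding.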
def ocr_resize(img, patch_image_size, is_document=False):
img = img.convert("RGB")
width, height = img.size
if is_document:
new_height, new_width = 64, 1920
else:
if width >= height:
new_width = max(64, patch_image_size)
new_height = max(64, int(patch_image_size * (height / width)))
top = random.randint(0, patch_image_size - new_height)
bottom = patch_image_size - new_height - top
left, right = 0, 0
else:
new_height = max(64, patch_image_size)
new_width = max(64, int(patch_image_size * (width / height)))
left = random.randint(0, patch_image_size - new_width)
right = patch_image_size - new_width - left
top, bottom = 0, 0
img_new = F.resize(
img,
[new_height, new_width],
interpolation=InterpolationMode.BICUBIC,
)
if is_document:
img_split = transforms.ToTensor()(img_new).chunk(4, dim=-1)
img_new = transforms.ToPILImage()(torch.cat(img_split, dim=-2))
new_width, new_height = img_new.size
top = random.randint(0, patch_image_size - new_height)
bottom = patch_image_size - new_height - top
left, right = 0, 0
img_new = F.pad(img_new, padding=[left, top, right, bottom], padding_mode="edge")
assert img_new.size == (patch_image_size, patch_image_size)
return img_new
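# OCR as text generation. Captions are converted to simplified Chinese
# (zhconv) and NFKC-normalized before tokenization.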
class OcrDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=224,
imagenet_default_mean_and_std=False,
is_document=False,
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose(
[
lambda image: ocr_resize(
image, patch_image_size, is_document=is_document
),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
]
)
self.bpe = bpe
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = " what are the texts on the image?"
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "图片上的文字是什么?"
def __getitem__(self, index):
uniq_id, image, caption = self.dataset[index]
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
caption = unicodedata.normalize("NFKC", convert(caption, "zh-hans"))
if type(self.bpe).__name__ == 'GPT2BPE':
caption_token_list = caption.lower().strip().split()
tgt_caption = ' '.join(caption_token_list[:self.max_tgt_length])
elif type(self.bpe).__name__ == 'BertBPE':
tgt_caption = caption[: self.max_tgt_length].lower()
src_item = self.encode_text(self.prompt)
tgt_item = self.encode_text(" {}".format(tgt_caption))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data required for the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
import utils.transforms as T
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
w_resize_ratios = torch.stack([s["w_resize_ratio"] for s in samples], dim=0)
h_resize_ratios = torch.stack([s["h_resize_ratio"] for s in samples], dim=0)
region_coords = torch.stack([s['region_coord'] for s in samples], dim=0)
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
"w_resize_ratios": w_resize_ratios,
"h_resize_ratios": h_resize_ratios,
"region_coords": region_coords
}
return batch
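# Referring-expression grounding (RefCOCO). The target box is rendered as four
# quantized location tokens "<bin_i>" (i in [0, num_bins-1]), so the decoder
# predicts coordinates as a short token sequence; resize ratios are kept so
# predictions can be mapped back to the original image.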
class RefcocoDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=512,
imagenet_default_mean_and_std=False,
num_bins=1000,
max_image_size=512
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.num_bins = num_bins
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
# for positioning
self.positioning_transform = T.Compose([
T.RandomResize([patch_image_size], max_size=patch_image_size),
T.ToTensor(),
T.Normalize(mean=mean, std=std, max_image_size=max_image_size)
])
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = ' which region does the text " {} " describe?'
elif type(bpe).__name__ == 'BertBPE':
self.prompt = '这段文字" {} "描述的是哪个区域?'
def __getitem__(self, index):
uniq_id, base64_str, text, region_coord = self.dataset[index]
image = Image.open(BytesIO(base64.urlsafe_b64decode(base64_str))).convert("RGB")
w, h = image.size
boxes_target = {"boxes": [], "labels": [], "area": [], "size": torch.tensor([h, w])}
x0, y0, x1, y1 = region_coord.strip().split(',')
region = torch.tensor([float(x0), float(y0), float(x1), float(y1)])
boxes_target["boxes"] = torch.tensor([[float(x0), float(y0), float(x1), float(y1)]])
boxes_target["labels"] = np.array([0])
boxes_target["area"] = torch.tensor([(float(x1) - float(x0)) * (float(y1) - float(y0))])
patch_image, patch_boxes = self.positioning_transform(image, boxes_target)
resize_h, resize_w = patch_boxes["size"][0], patch_boxes["size"][1]
patch_mask = torch.tensor([True])
quant_x0 = "<bin_{}>".format(int((patch_boxes["boxes"][0][0] * (self.num_bins - 1)).round()))
quant_y0 = "<bin_{}>".format(int((patch_boxes["boxes"][0][1] * (self.num_bins - 1)).round()))
quant_x1 = "<bin_{}>".format(int((patch_boxes["boxes"][0][2] * (self.num_bins - 1)).round()))
quant_y1 = "<bin_{}>".format(int((patch_boxes["boxes"][0][3] * (self.num_bins - 1)).round()))
region_coord = "{} {} {} {}".format(quant_x0, quant_y0, quant_x1, quant_y1)
src_caption = self.pre_caption(text, self.max_src_length)
src_item = self.encode_text(self.prompt.format(src_caption))
tgt_item = self.encode_text(region_coord, use_bpe=False)
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"w_resize_ratio": resize_w / w,
"h_resize_ratio": resize_h / h,
"region_coord": region
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
decoder_prompts = None
if samples[0].get("decoder_prompt", None) is not None:
decoder_prompts = np.array([s['decoder_prompt'].tolist() for s in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"decoder_prompts": decoder_prompts,
"target": target
}
return batch
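# Visual entailment (SNLI-VE). Labels are verbalized (entailment -> "yes",
# contradiction -> "no", neutral -> "maybe") so classification becomes
# constrained generation; the optional constraint trie masks the decoder to
# valid answer tokens.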
class SnliVeDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=224,
add_caption=False,
constraint_trie=None,
imagenet_default_mean_and_std=False,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.add_caption = add_caption
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
def __getitem__(self, index):
uniq_id, image, hypothesis, caption, label = self.dataset[index]
if label == 'contradiction':
label = 'no'
elif label == 'entailment':
label = 'yes'
elif label == 'neutral':
label = 'maybe'
else:
raise NotImplementedError
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
hypothesis = self.pre_caption(hypothesis, self.max_src_length)
src_item = self.encode_text(' does the image describe " {} "?'.format(hypothesis))
tgt_item = self.encode_text(" {}".format(label))
ref_dict = {label: 1.0}
if self.add_caption:
caption = self.pre_caption(caption, self.max_src_length)
src_item = self.encode_text(' can image and text1 " {} " imply text2 " {} "?'.format(caption, hypothesis))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = torch.cat([self.bos_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = self.bos_item
elif self.prompt_type == 'src':
prev_output_item = torch.cat([src_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item
elif self.prompt_type == 'prev_output':
prev_output_item = torch.cat([src_item[:-1], tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item[:-1]
else:
raise NotImplementedError
target_item[:-len(tgt_item)-1] = self.tgt_dict.pad()
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"decoder_prompt": decoder_prompt,
"ref_dict": ref_dict,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(target_item), len(self.tgt_dict))).bool()
start_idx = len(target_item) - len(tgt_item) - 1
for i in range(len(target_item)-len(tgt_item)-1, len(target_item)):
constraint_prefix_token = [self.tgt_dict.bos()] + target_item[start_idx:i].tolist()
constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
conf = None
if samples[0].get("conf", None) is not None:
conf = torch.cat([s['conf'] for s in samples], dim=0)
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
decoder_prompts = None
if samples[0].get("decoder_prompt", None) is not None:
decoder_prompts = np.array([s['decoder_prompt'].tolist() for s in samples])
prefix_tokens = None
if samples[0].get("decoder_prompt", None) is not None:
prefix_tokens = merge("decoder_prompt")
prefix_tokens = prefix_tokens[:, 1:]
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"conf": conf,
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"decoder_prompts": decoder_prompts,
"target": target,
"prefix_tokens": prefix_tokens
}
return batch
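# VQA as open-ended generation. "ref" packs weighted answers in the form
# "conf|!+answer&&..."; the highest-confidence answer becomes the target and
# its confidence is carried through the batch as "conf".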
class VqaGenDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
max_object_length=30,
max_tgt_length=30,
patch_image_size=224,
add_object=False,
constraint_trie=None,
imagenet_default_mean_and_std=False,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_object_length = max_object_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.add_object = add_object
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
def __getitem__(self, index):
item = self.dataset[index]
if len(item) == 5:
uniq_id, image, question, ref, predict_objects = item
else:
uniq_id, image, question, ref, predict_objects, caption = item
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
question = self.pre_question(question, self.max_src_length)
question = question + '?' if not question.endswith('?') else question
src_item = self.encode_text(' {}'.format(question))
        ref_dict = {r.split('|!+')[1]: float(r.split('|!+')[0]) for r in ref.split('&&')}
answer = max(ref_dict, key=ref_dict.get)
conf = torch.tensor([ref_dict[answer]])
tgt_item = self.encode_text(" {}".format(answer), length=self.max_tgt_length)
if self.add_object and predict_objects is not None:
predict_object_seq = ' '.join(predict_objects.strip().split('&&')[:self.max_object_length])
predict_object_item = self.encode_text(" object: {}".format(predict_object_seq))
src_item = torch.cat([src_item, predict_object_item])
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = torch.cat([self.bos_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = self.bos_item
elif self.prompt_type == 'src':
prev_output_item = torch.cat([src_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item
elif self.prompt_type == 'prev_output':
prev_output_item = torch.cat([src_item[:-1], tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item[:-1]
else:
raise NotImplementedError
target_item[:-len(tgt_item)-1] = self.tgt_dict.pad()
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"decoder_prompt": decoder_prompt,
"ref_dict": ref_dict,
"conf": conf,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(target_item), len(self.tgt_dict))).bool()
start_idx = len(target_item) - len(tgt_item) - 1
for i in range(len(target_item)-len(tgt_item)-1, len(target_item)):
constraint_prefix_token = [self.tgt_dict.bos()] + target_item[start_idx:i].tolist()
constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import logging
import warnings
import torch
import numpy as np
from data import data_utils
from data.ofa_dataset import OFADataset
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
target_strs = np.array([s["target_str"] for s in samples])
batch = {
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"prev_output_tokens": prev_output_tokens
},
"target": target,
"target_strs": target_strs
}
return batch
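# Abstractive summarization. With noise_ratio > 0, a random fraction of the
# decoder-input tokens is replaced by random vocabulary tokens (excluding
# special, code and bin tokens), a light denoising regularizer.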
class SummaryDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
code_dict_size=8192,
num_bins=1000,
max_src_length=512,
max_tgt_length=128,
noise_ratio=0.0
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.code_dict_size = code_dict_size
self.num_bins = num_bins
self.noise_ratio = noise_ratio
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = ' what is the summary of article " {} "?'
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "{} 请用一个句子简单总结上文:"
def __getitem__(self, index):
source, target = self.dataset[index]
target_str = target.lower()
source = self.pre_caption(source, max_words=self.max_src_length)
target = self.pre_caption(target, max_words=self.max_tgt_length)
source = source.replace('<unk>', 'unk')
target = target.replace('<unk>', 'unk')
src_item = self.encode_text(
self.prompt.format(source),
length=self.max_src_length
)
tgt_item = self.encode_text('{}'.format(target))
noise_tgt_item = self.add_noise_to_tgt(tgt_item.clone(), self.noise_ratio)
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, noise_tgt_item])
example = {
"source": src_item,
"target": target_item,
"prev_output_tokens": prev_output_item,
"target_str": target_str
}
return example
def add_noise_to_tgt(self, target, p):
noise_indices = torch.FloatTensor(target.size(0)).uniform_() < p
target[noise_indices] = torch.randint(
4, len(self.src_dict) - self.code_dict_size - self.num_bins, size=(noise_indices.sum(),)
)
return target
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import logging
import warnings
import torch
import numpy as np
from data import data_utils
from data.ofa_dataset import OFADataset
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"prev_output_tokens": prev_output_tokens
},
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"target": target,
}
return batch
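# CoLA grammatical acceptability (GLUE). Binary labels are verbalized to
# "yes"/"no" and scored as a single generated token, optionally constrained
# by the trie.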
class COLADataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=512,
max_tgt_length=30,
constraint_trie=None,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
def __getitem__(self, index):
sentence, label = self.dataset[index]
if label == '0':
label = 'no'
elif label == '1':
label = 'yes'
else:
raise NotImplementedError
sentence = ' '.join(sentence.lower().strip().split()[:self.max_src_length])
src_item = self.encode_text(' is the text " {} " grammatically correct?'.format(sentence))
tgt_item = self.encode_text(" {}".format(label))
assert tgt_item.size(0) == 1
ref_dict = {label: 1.0}
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = self.bos_item
target_item = tgt_item
elif self.prompt_type == 'src':
prev_output_item = src_item.clone()
target_item = torch.cat([prev_output_item[1:], tgt_item])
elif self.prompt_type == 'prev_output':
prev_output_item = src_item[:-1].clone()
target_item = torch.cat([prev_output_item[1:], tgt_item])
else:
raise NotImplementedError
target_item[:-1] = self.tgt_dict.pad()
example = {
"source": src_item,
"target": target_item,
"prev_output_tokens": prev_output_item,
"ref_dict": ref_dict,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(prev_output_item), len(self.tgt_dict))).bool()
constraint_nodes = self.constraint_trie.get_next_layer(self.bos_item.tolist())
constraint_mask[-1][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)