# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import string
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
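# Collate a list of per-example dicts into a padded mini-batch: token
# sequences are padded with pad_idx, image tensors are stacked, and
# "ntokens" counts non-pad target (or source) tokens for loss averaging.
# A hypothetical usage sketch (not part of this file):
#     batch = dataset.collater([dataset[i] for i in range(8)])
#     logits = model(**batch["net_input"])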
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
}
return batch
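# Image captioning. Training strips punctuation (self.transtab) and truncates
# the caption to max_tgt_length tokens; validation/test keeps all reference
# captions joined by "&&" so downstream scoring can compare against each one.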
class CaptionDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
max_tgt_length=30,
patch_image_size=224,
imagenet_default_mean_and_std=False,
scst=False
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.scst = scst
self.transtab = str.maketrans({key: None for key in string.punctuation})
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = " what does the image describe?"
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "图片描述了什么内容?"
def __getitem__(self, index):
uniq_id, image, caption = self.dataset[index]
        try:
            image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
        except Exception:
            # urlsafe_b64decode fails on strings whose length is not a
            # multiple of 4; dump the offending string for inspection,
            # re-pad with "=" and retry.
            with open("image.txt", "w") as f:
                f.write(image)
            if len(image) % 4 != 0:
                image += "=" * (4 - len(image) % 4)
            print('***** corrupt image *****', index, uniq_id, image[-20:])
            image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
if self.split == 'train' and not self.scst:
caption = caption.translate(self.transtab).strip()
caption_token_list = caption.strip().split()
tgt_caption = ' '.join(caption_token_list[:self.max_tgt_length])
else:
caption = ' '.join(caption.strip().split())
caption_list = [cap.translate(self.transtab).strip() for cap in caption.strip().split('&&')]
tgt_caption = '&&'.join(caption_list)
src_item = self.encode_text(self.prompt)
tgt_item = self.encode_text(" {}".format(tgt_caption))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import base64
import random
import numpy as np
import torch
from PIL import Image, ImageFile
from itertools import chain
from data.ofa_dataset import OFADataset
from data import data_utils
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
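# Collate for text-to-image generation. Targets are sequences of discrete
# image codes; the raw base64 image string is passed through unchanged in
# "code_images" (a numpy object array) for use outside the model, e.g.
# evaluation-time reconstruction or scoring.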
def collate(
samples,
pad_idx,
eos_idx,
left_pad_source=False,
left_pad_target=False,
):
if len(samples) == 0:
return {}
def merge(key, left_pad, move_eos_to_beginning=False):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx,
left_pad,
move_eos_to_beginning,
)
id = np.array([s["id"] for s in samples])
    src_tokens = merge("source", left_pad=left_pad_source)
    src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
code_images = np.array([s["code_image"] for s in samples])
code_masks = torch.cat([sample['code_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target", left_pad=left_pad_target)
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target)
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"code_masks": code_masks,
"prev_output_tokens": prev_output_tokens
},
"code_images": code_images,
"target": target
}
return batch
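# VQGAN-style tokenizers expect pixel values in [-1, 1]; this maps a tensor
# from [0, 1] into that range.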
def preprocess_vqgan(x):
x = 2. * x - 1.
return x
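# Text-to-image generation. Raw VQ code indices are shifted into the region of
# the source dictionary reserved for image-code tokens; a blank placeholder
# image is base64-encoded once per dataset slice for rows without an image.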
class ImageGenDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
code_dict_size=8192,
code_image_size=256,
num_bins=1000
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.code_dict_size = code_dict_size
self.num_codes = (code_image_size // 8) ** 2
self.num_bins = num_bins
slice_id = self.dataset.slice_id
empty_img = Image.new('RGB', (code_image_size, code_image_size))
empty_img.save(f'temp_{slice_id}.png')
img = Image.open(f'temp_{slice_id}.png')
img_buffer = BytesIO()
img.save(img_buffer, format=img.format)
byte_data = img_buffer.getvalue()
self.empty_image_base64 = (base64.urlsafe_b64encode(byte_data)).decode("utf-8")
def __getitem__(self, index):
data = self.dataset[index]
if len(data) == 2:
uniq_id, text = data
image_code = [0] * 1024
image = self.empty_image_base64
elif len(data) == 3:
uniq_id, text, image_code = data
image_code = [int(num) for num in image_code.strip().split()]
image = self.empty_image_base64
elif len(data) == 4:
uniq_id, image, text, image_code = data
image_code = [int(num) for num in image_code.strip().split()]
else:
raise NotImplementedError
code_mask = torch.tensor([True])
image_code = torch.LongTensor(image_code)
tgt_item = image_code + len(self.src_dict) - self.code_dict_size - self.num_bins
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
caption_token_list = text.strip().split()
caption = ' '.join(caption_token_list[:self.max_src_length])
src_item = self.encode_text(
" what is the complete image? caption: {}".format(caption),
append_bos=True,
append_eos=True
)
example = {
"id": uniq_id,
"source": src_item,
"code_mask": code_mask,
"code_image": image,
"target": target_item,
"prev_output_tokens": prev_output_item
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import random
import functools
import numpy as np
import torch
import base64
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms import functional as F
from PIL import Image, ImageFile
from zhconv import convert
import unicodedata
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
}
return batch
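# Resize keeping aspect ratio, then pad to a square patch_image_size canvas at
# a random offset using "edge" padding. In document mode the image is first
# resized to a 64x1920 strip, chunked into 4 pieces along the width, and the
# pieces are stacked vertically before padding.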
def ocr_resize(img, patch_image_size, is_document=False):
img = img.convert("RGB")
width, height = img.size
if is_document:
new_height, new_width = 64, 1920
else:
if width >= height:
new_width = max(64, patch_image_size)
new_height = max(64, int(patch_image_size * (height / width)))
top = random.randint(0, patch_image_size - new_height)
bottom = patch_image_size - new_height - top
left, right = 0, 0
else:
new_height = max(64, patch_image_size)
new_width = max(64, int(patch_image_size * (width / height)))
left = random.randint(0, patch_image_size - new_width)
right = patch_image_size - new_width - left
top, bottom = 0, 0
img_new = F.resize(
img,
[new_height, new_width],
interpolation=InterpolationMode.BICUBIC,
)
if is_document:
img_split = transforms.ToTensor()(img_new).chunk(4, dim=-1)
img_new = transforms.ToPILImage()(torch.cat(img_split, dim=-2))
new_width, new_height = img_new.size
top = random.randint(0, patch_image_size - new_height)
bottom = patch_image_size - new_height - top
left, right = 0, 0
img_new = F.pad(img_new, padding=[left, top, right, bottom], padding_mode="edge")
assert img_new.size == (patch_image_size, patch_image_size)
return img_new
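# OCR as text generation. Captions are converted to simplified Chinese
# (zhconv) and NFKC-normalized before tokenization.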
class OcrDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=224,
imagenet_default_mean_and_std=False,
is_document=False,
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose(
[
lambda image: ocr_resize(
image, patch_image_size, is_document=is_document
),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
]
)
self.bpe = bpe
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = " what are the texts on the image?"
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "图片上的文字是什么?"
def __getitem__(self, index):
uniq_id, image, caption = self.dataset[index]
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
caption = unicodedata.normalize("NFKC", convert(caption, "zh-hans"))
if type(self.bpe).__name__ == 'GPT2BPE':
caption_token_list = caption.lower().strip().split()
tgt_caption = ' '.join(caption_token_list[:self.max_tgt_length])
elif type(self.bpe).__name__ == 'BertBPE':
tgt_caption = caption[: self.max_tgt_length].lower()
src_item = self.encode_text(self.prompt)
tgt_item = self.encode_text(" {}".format(tgt_caption))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data required for the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
import utils.transforms as T
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
w_resize_ratios = torch.stack([s["w_resize_ratio"] for s in samples], dim=0)
h_resize_ratios = torch.stack([s["h_resize_ratio"] for s in samples], dim=0)
region_coords = torch.stack([s['region_coord'] for s in samples], dim=0)
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor([s["target"].ne(pad_idx).long().sum() for s in samples])
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"target": target,
"w_resize_ratios": w_resize_ratios,
"h_resize_ratios": h_resize_ratios,
"region_coords": region_coords
}
return batch
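# Referring-expression grounding (RefCOCO). The target box is rendered as four
# quantized location tokens "<bin_i>" (i in [0, num_bins-1]), so the decoder
# predicts coordinates as a short token sequence; resize ratios are kept so
# predictions can be mapped back to the original image.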
class RefcocoDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=512,
imagenet_default_mean_and_std=False,
num_bins=1000,
max_image_size=512
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.num_bins = num_bins
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
# for positioning
self.positioning_transform = T.Compose([
T.RandomResize([patch_image_size], max_size=patch_image_size),
T.ToTensor(),
T.Normalize(mean=mean, std=std, max_image_size=max_image_size)
])
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = ' which region does the text " {} " describe?'
elif type(bpe).__name__ == 'BertBPE':
self.prompt = '这段文字" {} "描述的是哪个区域?'
def __getitem__(self, index):
uniq_id, base64_str, text, region_coord = self.dataset[index]
image = Image.open(BytesIO(base64.urlsafe_b64decode(base64_str))).convert("RGB")
w, h = image.size
boxes_target = {"boxes": [], "labels": [], "area": [], "size": torch.tensor([h, w])}
x0, y0, x1, y1 = region_coord.strip().split(',')
region = torch.tensor([float(x0), float(y0), float(x1), float(y1)])
boxes_target["boxes"] = torch.tensor([[float(x0), float(y0), float(x1), float(y1)]])
boxes_target["labels"] = np.array([0])
boxes_target["area"] = torch.tensor([(float(x1) - float(x0)) * (float(y1) - float(y0))])
patch_image, patch_boxes = self.positioning_transform(image, boxes_target)
resize_h, resize_w = patch_boxes["size"][0], patch_boxes["size"][1]
patch_mask = torch.tensor([True])
quant_x0 = "<bin_{}>".format(int((patch_boxes["boxes"][0][0] * (self.num_bins - 1)).round()))
quant_y0 = "<bin_{}>".format(int((patch_boxes["boxes"][0][1] * (self.num_bins - 1)).round()))
quant_x1 = "<bin_{}>".format(int((patch_boxes["boxes"][0][2] * (self.num_bins - 1)).round()))
quant_y1 = "<bin_{}>".format(int((patch_boxes["boxes"][0][3] * (self.num_bins - 1)).round()))
region_coord = "{} {} {} {}".format(quant_x0, quant_y0, quant_x1, quant_y1)
src_caption = self.pre_caption(text, self.max_src_length)
src_item = self.encode_text(self.prompt.format(src_caption))
tgt_item = self.encode_text(region_coord, use_bpe=False)
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, tgt_item])
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"w_resize_ratio": resize_w / w,
"h_resize_ratio": resize_h / h,
"region_coord": region
}
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
decoder_prompts = None
if samples[0].get("decoder_prompt", None) is not None:
decoder_prompts = np.array([s['decoder_prompt'].tolist() for s in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"decoder_prompts": decoder_prompts,
"target": target
}
return batch
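# Visual entailment (SNLI-VE). Labels are verbalized (entailment -> "yes",
# contradiction -> "no", neutral -> "maybe") so classification becomes
# constrained generation; the optional constraint trie masks the decoder to
# valid answer tokens.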
class SnliVeDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=80,
max_tgt_length=30,
patch_image_size=224,
add_caption=False,
constraint_trie=None,
imagenet_default_mean_and_std=False,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.add_caption = add_caption
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
def __getitem__(self, index):
uniq_id, image, hypothesis, caption, label = self.dataset[index]
if label == 'contradiction':
label = 'no'
elif label == 'entailment':
label = 'yes'
elif label == 'neutral':
label = 'maybe'
else:
raise NotImplementedError
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
hypothesis = self.pre_caption(hypothesis, self.max_src_length)
src_item = self.encode_text(' does the image describe " {} "?'.format(hypothesis))
tgt_item = self.encode_text(" {}".format(label))
ref_dict = {label: 1.0}
if self.add_caption:
caption = self.pre_caption(caption, self.max_src_length)
src_item = self.encode_text(' can image and text1 " {} " imply text2 " {} "?'.format(caption, hypothesis))
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = torch.cat([self.bos_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = self.bos_item
elif self.prompt_type == 'src':
prev_output_item = torch.cat([src_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item
elif self.prompt_type == 'prev_output':
prev_output_item = torch.cat([src_item[:-1], tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item[:-1]
else:
raise NotImplementedError
target_item[:-len(tgt_item)-1] = self.tgt_dict.pad()
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"decoder_prompt": decoder_prompt,
"ref_dict": ref_dict,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(target_item), len(self.tgt_dict))).bool()
start_idx = len(target_item) - len(tgt_item) - 1
for i in range(len(target_item)-len(tgt_item)-1, len(target_item)):
constraint_prefix_token = [self.tgt_dict.bos()] + target_item[start_idx:i].tolist()
constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
from io import BytesIO
import logging
import warnings
import numpy as np
import torch
import base64
from torchvision import transforms
from PIL import Image, ImageFile
from data import data_utils
from data.ofa_dataset import OFADataset
ImageFile.LOAD_TRUNCATED_IMAGES = True
ImageFile.MAX_IMAGE_PIXELS = None
Image.MAX_IMAGE_PIXELS = None
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
id = np.array([s["id"] for s in samples])
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
patch_images = torch.stack([sample['patch_image'] for sample in samples], dim=0)
patch_masks = torch.cat([sample['patch_mask'] for sample in samples])
conf = None
if samples[0].get("conf", None) is not None:
conf = torch.cat([s['conf'] for s in samples], dim=0)
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
decoder_prompts = None
if samples[0].get("decoder_prompt", None) is not None:
decoder_prompts = np.array([s['decoder_prompt'].tolist() for s in samples])
prefix_tokens = None
if samples[0].get("decoder_prompt", None) is not None:
prefix_tokens = merge("decoder_prompt")
prefix_tokens = prefix_tokens[:, 1:]
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"patch_images": patch_images,
"patch_masks": patch_masks,
"prev_output_tokens": prev_output_tokens
},
"conf": conf,
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"decoder_prompts": decoder_prompts,
"target": target,
"prefix_tokens": prefix_tokens
}
return batch
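# VQA as open-ended generation. "ref" packs weighted answers in the form
# "conf|!+answer&&..."; the highest-confidence answer becomes the target and
# its confidence is carried through the batch as "conf".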
class VqaGenDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=128,
max_object_length=30,
max_tgt_length=30,
patch_image_size=224,
add_object=False,
constraint_trie=None,
imagenet_default_mean_and_std=False,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_object_length = max_object_length
self.max_tgt_length = max_tgt_length
self.patch_image_size = patch_image_size
self.add_object = add_object
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
if imagenet_default_mean_and_std:
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
else:
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
self.patch_resize_transform = transforms.Compose([
lambda image: image.convert("RGB"),
transforms.Resize((patch_image_size, patch_image_size), interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
def __getitem__(self, index):
item = self.dataset[index]
if len(item) == 5:
uniq_id, image, question, ref, predict_objects = item
else:
uniq_id, image, question, ref, predict_objects, caption = item
image = Image.open(BytesIO(base64.urlsafe_b64decode(image)))
patch_image = self.patch_resize_transform(image)
patch_mask = torch.tensor([True])
question = self.pre_question(question, self.max_src_length)
question = question + '?' if not question.endswith('?') else question
src_item = self.encode_text(' {}'.format(question))
        ref_dict = {r.split('|!+')[1]: float(r.split('|!+')[0]) for r in ref.split('&&')}
answer = max(ref_dict, key=ref_dict.get)
conf = torch.tensor([ref_dict[answer]])
tgt_item = self.encode_text(" {}".format(answer), length=self.max_tgt_length)
if self.add_object and predict_objects is not None:
predict_object_seq = ' '.join(predict_objects.strip().split('&&')[:self.max_object_length])
predict_object_item = self.encode_text(" object: {}".format(predict_object_seq))
src_item = torch.cat([src_item, predict_object_item])
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = torch.cat([self.bos_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = self.bos_item
elif self.prompt_type == 'src':
prev_output_item = torch.cat([src_item, tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item
elif self.prompt_type == 'prev_output':
prev_output_item = torch.cat([src_item[:-1], tgt_item])
target_item = torch.cat([prev_output_item[1:], self.eos_item])
decoder_prompt = src_item[:-1]
else:
raise NotImplementedError
target_item[:-len(tgt_item)-1] = self.tgt_dict.pad()
example = {
"id": uniq_id,
"source": src_item,
"patch_image": patch_image,
"patch_mask": patch_mask,
"target": target_item,
"prev_output_tokens": prev_output_item,
"decoder_prompt": decoder_prompt,
"ref_dict": ref_dict,
"conf": conf,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(target_item), len(self.tgt_dict))).bool()
start_idx = len(target_item) - len(tgt_item) - 1
for i in range(len(target_item)-len(tgt_item)-1, len(target_item)):
constraint_prefix_token = [self.tgt_dict.bos()] + target_item[start_idx:i].tolist()
constraint_nodes = self.constraint_trie.get_next_layer(constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import logging
import warnings
import torch
import numpy as np
from data import data_utils
from data.ofa_dataset import OFADataset
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
target_strs = np.array([s["target_str"] for s in samples])
batch = {
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"prev_output_tokens": prev_output_tokens
},
"target": target,
"target_strs": target_strs
}
return batch
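# Abstractive summarization. With noise_ratio > 0, a random fraction of the
# decoder-input tokens is replaced by random vocabulary tokens (excluding
# special, code and bin tokens), a light denoising regularizer.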
class SummaryDataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
code_dict_size=8192,
num_bins=1000,
max_src_length=512,
max_tgt_length=128,
noise_ratio=0.0
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.code_dict_size = code_dict_size
self.num_bins = num_bins
self.noise_ratio = noise_ratio
if type(bpe).__name__ == 'GPT2BPE':
self.prompt = ' what is the summary of article " {} "?'
elif type(bpe).__name__ == 'BertBPE':
self.prompt = "{} 请用一个句子简单总结上文:"
def __getitem__(self, index):
source, target = self.dataset[index]
target_str = target.lower()
source = self.pre_caption(source, max_words=self.max_src_length)
target = self.pre_caption(target, max_words=self.max_tgt_length)
source = source.replace('<unk>', 'unk')
target = target.replace('<unk>', 'unk')
src_item = self.encode_text(
self.prompt.format(source),
length=self.max_src_length
)
tgt_item = self.encode_text('{}'.format(target))
noise_tgt_item = self.add_noise_to_tgt(tgt_item.clone(), self.noise_ratio)
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
target_item = torch.cat([tgt_item, self.eos_item])
prev_output_item = torch.cat([self.bos_item, noise_tgt_item])
example = {
"source": src_item,
"target": target_item,
"prev_output_tokens": prev_output_item,
"target_str": target_str
}
return example
def add_noise_to_tgt(self, target, p):
noise_indices = torch.FloatTensor(target.size(0)).uniform_() < p
target[noise_indices] = torch.randint(
4, len(self.src_dict) - self.code_dict_size - self.num_bins, size=(noise_indices.sum(),)
)
return target
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)
# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.
import logging
import warnings
import torch
import numpy as np
from data import data_utils
from data.ofa_dataset import OFADataset
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)
def collate(samples, pad_idx, eos_idx):
if len(samples) == 0:
return {}
def merge(key):
return data_utils.collate_tokens(
[s[key] for s in samples],
pad_idx,
eos_idx=eos_idx,
)
src_tokens = merge("source")
src_lengths = torch.LongTensor([s["source"].ne(pad_idx).long().sum() for s in samples])
ref_dict = None
if samples[0].get("ref_dict", None) is not None:
ref_dict = np.array([s['ref_dict'] for s in samples])
constraint_masks = None
if samples[0].get("constraint_mask", None) is not None:
constraint_masks = merge("constraint_mask")
prev_output_tokens = None
target = None
if samples[0].get("target", None) is not None:
target = merge("target")
tgt_lengths = torch.LongTensor(
[s["target"].ne(pad_idx).long().sum() for s in samples]
)
ntokens = tgt_lengths.sum().item()
if samples[0].get("prev_output_tokens", None) is not None:
prev_output_tokens = merge("prev_output_tokens")
else:
ntokens = src_lengths.sum().item()
batch = {
"nsentences": len(samples),
"ntokens": ntokens,
"net_input": {
"src_tokens": src_tokens,
"src_lengths": src_lengths,
"prev_output_tokens": prev_output_tokens
},
"ref_dict": ref_dict,
"constraint_masks": constraint_masks,
"target": target,
}
return batch
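# CoLA grammatical acceptability (GLUE). Binary labels are verbalized to
# "yes"/"no" and scored as a single generated token, optionally constrained
# by the trie.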
class COLADataset(OFADataset):
def __init__(
self,
split,
dataset,
bpe,
src_dict,
tgt_dict=None,
max_src_length=512,
max_tgt_length=30,
constraint_trie=None,
prompt_type="none"
):
super().__init__(split, dataset, bpe, src_dict, tgt_dict)
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self.constraint_trie = constraint_trie
self.prompt_type = prompt_type
def __getitem__(self, index):
sentence, label = self.dataset[index]
if label == '0':
label = 'no'
elif label == '1':
label = 'yes'
else:
raise NotImplementedError
sentence = ' '.join(sentence.lower().strip().split()[:self.max_src_length])
src_item = self.encode_text(' is the text " {} " grammatically correct?'.format(sentence))
tgt_item = self.encode_text(" {}".format(label))
assert tgt_item.size(0) == 1
ref_dict = {label: 1.0}
src_item = torch.cat([self.bos_item, src_item, self.eos_item])
if self.prompt_type == 'none':
prev_output_item = self.bos_item
target_item = tgt_item
elif self.prompt_type == 'src':
prev_output_item = src_item.clone()
target_item = torch.cat([prev_output_item[1:], tgt_item])
elif self.prompt_type == 'prev_output':
prev_output_item = src_item[:-1].clone()
target_item = torch.cat([prev_output_item[1:], tgt_item])
else:
raise NotImplementedError
target_item[:-1] = self.tgt_dict.pad()
example = {
"source": src_item,
"target": target_item,
"prev_output_tokens": prev_output_item,
"ref_dict": ref_dict,
}
if self.constraint_trie is not None:
constraint_mask = torch.zeros((len(prev_output_item), len(self.tgt_dict))).bool()
constraint_nodes = self.constraint_trie.get_next_layer(self.bos_item.tolist())
constraint_mask[-1][constraint_nodes] = True
example["constraint_mask"] = constraint_mask
return example
def collater(self, samples, pad_to_length=None):
"""Merge a list of samples to form a mini-batch.
Args:
samples (List[dict]): samples to collate
Returns:
dict: a mini-batch containing the data of the task
"""
return collate(samples, pad_idx=self.pad, eos_idx=self.eos)