# convert_flickr30k_ann.py

# Copyright (c) OpenMMLab. All rights reserved.
"""Create COCO-Style GT annotations based on raw annotation of Flickr30k.

GT annotations are used for evaluation in image caption task.
"""

import json


def main():
    """Convert the raw Flickr30k annotation file into per-split GT files.

    Reads ``dataset_flickr30k.json`` from the current working directory and
    writes ``flickr30k_train_gt.json``, ``flickr30k_val_gt.json`` and
    ``flickr30k_test_gt.json`` into the same directory. Every output file is
    a dict with two keys:

    - ``annotations``: one ``{'image_id', 'id', 'caption'}`` dict per
      sentence, taken from ``imgid``, ``sentid`` and ``raw``.
    - ``images``: one ``{'id': imgid}`` dict per image.

    Each file holds only the entries whose ``split`` matches the file's
    split; images with any other ``split`` value are ignored. A file is
    written for every split even when it ends up empty.

    An element of ``annotations['images']`` in the raw file looks like::

        {
            "sentids": [0, 1, 2],
            "imgid": 0,
            "sentences": [
                {"raw": "Two men in green shirts standing in a yard.",
                 "imgid": 0, "sentid": 0},
                {"raw": "A man in a blue shirt standing in a garden.",
                 "imgid": 0, "sentid": 1},
                {"raw": "Two friends enjoy time spent together.",
                 "imgid": 0, "sentid": 2}
            ],
            "split": "train",
            "filename": "1000092795.jpg"
        }

    Raises:
        FileNotFoundError: If ``dataset_flickr30k.json`` does not exist.
        KeyError: If an image or sentence entry lacks an expected field.
    """
    with open('dataset_flickr30k.json', 'r') as f:
        annotations = json.load(f)

    splits = ['train', 'val', 'test']

    # Each split gets its OWN pair of lists. Sharing one pair across splits
    # would leak train entries into the val file and train+val entries into
    # the test file.
    gt_by_split = {
        split: {'annotations': [], 'images': []}
        for split in splits
    }

    # Single pass over the images; input order is preserved within a split.
    for img in annotations['images']:
        gt = gt_by_split.get(img['split'])
        if gt is None:
            # Not one of the exported splits.
            continue

        gt['images'].append({'id': img['imgid']})
        gt['annotations'].extend(
            {
                'image_id': img['imgid'],
                'id': sentence['sentid'],
                'caption': sentence['raw'],
            }
            for sentence in img['sentences']
        )

    # generate flickr30k_train_gt.json, flickr30k_val_gt.json
    # and flickr30k_test_gt.json
    for split in splits:
        with open(f'flickr30k_{split}_gt.json', 'w') as f:
            json.dump(gt_by_split[split], f)


# Run the conversion only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == '__main__':
    main()