coca_data.py

from clip_benchmark.datasets.builder import build_dataset
import pandas as pd
import os

root_path = "datasets" # set this to smth meaningful
ds = build_dataset("mscoco_captions", root=root_path, split="val", task="captioning") # this downloads the dataset if it is not there already
coco = ds.coco
imgs = coco.loadImgs(coco.getImgIds())
future_df = {"filepath":[], "title":[]}
for img in imgs:
    caps = coco.imgToAnns[img["id"]]
    for cap in caps:
        future_df["filepath"].append(img["file_name"])
        future_df["title"].append(cap["caption"])
pd.DataFrame.from_dict(future_df).to_csv(
  os.path.join(root_path, "train2014.csv"), index=False, sep="\t"
)