""" RealWorldQA Dataset Utilities Data loading and processing utilities, fully independent of VLMEvalKit. """ import os import pandas as pd import numpy as np import string from typing import Dict, Any, List from PIL import Image from common_utils import download_file, md5, toliststr, decode_base64_to_image_file # RealWorldQA dataset URL and MD5 REALWORLDQA_DATASET_URL = 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv' REALWORLDQA_DATASET_MD5 = '92321028d2bc29040284b6674721e48f' def load_dataset(dataset_name='RealWorldQA'): """ Load RealWorldQA dataset. Args: dataset_name: Dataset name (default: 'RealWorldQA') Returns: pd.DataFrame: Loaded dataset """ if 'LMUData' not in os.environ: raise ValueError("Please set LMUData environment variable or use --data-dir argument") data_root = os.path.join(os.environ['LMUData']) os.makedirs(data_root, exist_ok=True) file_name = f"{dataset_name}.tsv" data_path = os.path.join(data_root, file_name) # Download dataset if not exists or MD5 mismatch if not os.path.exists(data_path) or md5(data_path) != REALWORLDQA_DATASET_MD5: print(f"Downloading {dataset_name} dataset...") download_file(REALWORLDQA_DATASET_URL, data_path) # Load dataset data = pd.read_csv(data_path, sep='\t') # Process dataset data['index'] = [str(x) for x in data['index']] # Process image data (base64 encoded or referenced) if 'image' in data: data['image'] = [str(x) for x in data['image']] image_map = {x: y for x, y in zip(data['index'], data['image'])} # Process image references (some images may reference other indices) for k in image_map: if len(image_map[k]) <= 64: idx = image_map[k] assert idx in image_map and len(image_map[idx]) > 64 image_map[k] = image_map[idx] images = [toliststr(image_map[k]) for k in data['index']] data['image'] = [x[0] if len(x) == 1 else x for x in images] # Process image paths if 'image_path' in data: paths = [toliststr(x) for x in data['image_path']] data['image_path'] = [x[0] if len(x) == 1 else x for x in paths] # Convert index to integer if possible if np.all([isinstance(x, int) or (isinstance(x, str) and x.isdigit()) for x in data['index']]): data['index'] = [int(x) for x in data['index']] return data def dump_image(line, img_root): """ Save image data to disk and return path. Args: line: Data row containing image data img_root: Image save root directory Returns: list: List of image paths """ os.makedirs(img_root, exist_ok=True) if 'image' in line: if isinstance(line['image'], list): tgt_path = [] assert 'image_path' in line for img, im_name in zip(line['image'], line['image_path']): path = os.path.join(img_root, im_name) if not os.path.exists(path): decode_base64_to_image_file(img, path) tgt_path.append(path) else: tgt_path = os.path.join(img_root, f"{line['index']}.jpg") if not os.path.exists(tgt_path): decode_base64_to_image_file(line['image'], tgt_path) tgt_path = [tgt_path] else: assert 'image_path' in line tgt_path = toliststr(line['image_path']) return tgt_path def build_realworldqa_prompt(line, dump_image_func, min_pixels, max_pixels): """ Build RealWorldQA dataset prompt. Args: line: Data row dump_image_func: Image save function min_pixels: Minimum pixels max_pixels: Maximum pixels Returns: list: List of messages in standard conversation format """ # Save and get image path tgt_path = dump_image_func(line) # Build question text question = line['question'] # Build options options = { cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand]) } options_prompt = 'Options:\n' for key, item in options.items(): options_prompt += f'{key}. {item}\n' # Process hint if exists hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None # Build complete prompt prompt = '' if hint is not None: prompt += f'Hint: {hint}\n' prompt += f'Question: {question}\n' if len(options): prompt += options_prompt prompt += 'Please select the correct answer from the options above. \n' # Build messages in standard conversation format content = [] # Add images (using file:// prefix for consistency) if isinstance(tgt_path, list): for p in tgt_path: content.append({ "type": "image", "image": f"file://{p}", "min_pixels": min_pixels, "max_pixels": max_pixels }) else: content.append({ "type": "image", "image": f"file://{tgt_path}", "min_pixels": min_pixels, "max_pixels": max_pixels }) # Add text content.append({"type": "text", "text": prompt}) # Return messages in standard conversation format messages = [{ "role": "user", "content": content }] return messages