Commit 463544a1 authored by luopl

Initial commit
.DS_Store
.idea
*/.DS_Store
__pycache__
*/__pycache__/
# EVAL
Our evaluation process consists of the following steps:
1. Prepare the Environment and Dataset
- Install required dependencies:
```bash
conda env create -f qwen25vl_environment.yml
conda activate qwen25vl
```
- Set up your API key in `secret_t2.env` for GPT-4.1 access (the evaluators read the key from the first line of this file; see `get_api_key` in `mllm_tools/openai.py`)
- Then download our dataset stepfun-ai/GEdit-Bench:
```python
from datasets import load_dataset
dataset = load_dataset("stepfun-ai/GEdit-Bench")
```
2. Generate and Organize Your Images
- Generate images following the example code in `generate_image_example.py`
- Organize your generated images in the following directory structure (a minimal saving sketch follows the tree):
```
results/
├── method_name/
│   └── fullset/
│       └── edit_task/
│           ├── cn/               # Chinese instructions
│           │   ├── key1.png
│           │   ├── key2.png
│           │   └── ...
│           └── en/               # English instructions
│               ├── key1.png
│               ├── key2.png
│               └── ...
```
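The path convention matches `generate_image_example.py`: `{save_path}/fullset/{task_type}/{instruction_language}/{key}.png`. A minimal saving sketch, assuming a hypothetical `your_edit_model` callable standing in for your editing method:
```python
import os
from datasets import load_dataset

dataset = load_dataset("stepfun-ai/GEdit-Bench")
results_root = "results/your_method"  # replace "your_method" with your method name

for item in dataset["train"]:
    # edited = your_edit_model(item["input_image_raw"], item["instruction"])  # hypothetical model call
    out_path = os.path.join(
        results_root, "fullset",
        item["task_type"], item["instruction_language"], f"{item['key']}.png",
    )
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # edited.save(out_path)
```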
3. Evaluate using GPT-4.1 / Qwen2.5-VL-72B-Instruct-AWQ
- For GPT-4.1 evaluation:
```bash
python test_gedit_score.py --model_name your_method --save_path /path/to/results --backbone gpt4o
```
- For Qwen evaluation:
```bash
python test_gedit_score.py --model_name your_method --save_path /path/to/results --backbone qwen25vl
```
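- Note: the `qwen25vl` backbone loads the Qwen2.5-VL-72B-Instruct-AWQ checkpoint from a local path hard-coded in `mllm_tools/qwen25vl_eval.py`; update that path for your environment before running.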
4. Analyze your results and obtain scores across all dimensions
- Run the analysis script to get scores for semantics, quality, and overall performance:
```bash
python calculate_statistics.py --model_name your_method --save_path /path/to/results --backbone gpt4o
```
- This will output scores broken down by edit category and provide aggregate metrics
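- The per-sample overall score is the geometric mean `sqrt(semantics_score * quality_score)`; per-task averages and the cross-task mean are then computed by `calculate_statistics.py`.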
# Acknowledgements
This project builds upon and adapts code from the following excellent repositories:
- [VIEScore](https://github.com/TIGER-AI-Lab/VIEScore): A visual instruction-guided explainable metric for evaluating conditional image synthesis
We thank the authors of these repositories for making their code publicly available.
import megfile
import os
import pandas as pd
from collections import defaultdict
import sys
import numpy as np
import math
GROUPS = [
"background_change", "color_alter", "material_alter", "motion_change", "ps_human", "style_change", "subject-add", "subject-remove", "subject-replace", "text_change", "tone_transfer"
]
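# analyze_scores: read the per-task "<model>_<task>_gpt_score.csv" files, keep rows whose
# instruction_language matches `language`, and average the semantics / quality / overall
# (geometric-mean) scores per task, both over all samples and over the intersection_exist subset.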
def analyze_scores(save_path_dir, evaluate_group, language):
results = defaultdict(dict)
save_path_new = save_path_dir
model_total_score = defaultdict(dict)
group_dict_sub = {}
group_scores_semantics = defaultdict(lambda: defaultdict(list))
group_scores_quality = defaultdict(lambda: defaultdict(list))
group_scores_overall = defaultdict(lambda: defaultdict(list))
group_scores_semantics_intersection = defaultdict(lambda: defaultdict(list))
group_scores_quality_intersection = defaultdict(lambda: defaultdict(list))
group_scores_overall_intersection = defaultdict(lambda: defaultdict(list))
length_total = 0
save_path_dir_raw = save_path_dir
for group_name in GROUPS:
csv_path = os.path.join(save_path_new, f"{evaluate_group[0]}_{group_name}_gpt_score.csv")
csv_file = megfile.smart_open(csv_path)
df = pd.read_csv(csv_file)
filtered_semantics_scores = []
filtered_quality_scores = []
filtered_overall_scores = []
filtered_semantics_scores_intersection = []
filtered_quality_scores_intersection = []
filtered_overall_scores_intersection = []
for _, row in df.iterrows():
source_image = row['source_image']
edited_image = row['edited_image']
instruction = row['instruction']
semantics_score = row['sementics_score']
quality_score = row['quality_score']
intersection_exist = row['intersection_exist']
instruction_language = row['instruction_language']
if instruction_language != language:
continue
overall_score = math.sqrt(semantics_score * quality_score)
filtered_semantics_scores.append(semantics_score)
filtered_quality_scores.append(quality_score)
filtered_overall_scores.append(overall_score)
if intersection_exist:
filtered_semantics_scores_intersection.append(semantics_score)
filtered_quality_scores_intersection.append(quality_score)
filtered_overall_scores_intersection.append(overall_score)
avg_semantics_score = np.mean(filtered_semantics_scores)
avg_quality_score = np.mean(filtered_quality_scores)
avg_overall_score = np.mean(filtered_overall_scores)
group_scores_semantics[evaluate_group[0]][group_name] = avg_semantics_score
group_scores_quality[evaluate_group[0]][group_name] = avg_quality_score
group_scores_overall[evaluate_group[0]][group_name] = avg_overall_score
avg_semantics_score_intersection = np.mean(filtered_semantics_scores_intersection)
avg_quality_score_intersection = np.mean(filtered_quality_scores_intersection)
avg_overall_score_intersection = np.mean(filtered_overall_scores_intersection)
group_scores_semantics_intersection[evaluate_group[0]][group_name] = avg_semantics_score_intersection
group_scores_quality_intersection[evaluate_group[0]][group_name] = avg_quality_score_intersection
group_scores_overall_intersection[evaluate_group[0]][group_name] = avg_overall_score_intersection
print("\n--- Overall Model Averages ---")
print("\nSemantics:")
for model_name in evaluate_group:
model_scores = [group_scores_semantics[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_semantics[model_name]["avg_semantics"] = model_avg
print("\nSemantics Intersection:")
for model_name in evaluate_group:
model_scores = [group_scores_semantics_intersection[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_semantics_intersection[model_name]["avg_semantics"] = model_avg
print("\nQuality:")
for model_name in evaluate_group:
model_scores = [group_scores_quality[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_quality[model_name]["avg_quality"] = model_avg
print("\nQuality Intersection:")
for model_name in evaluate_group:
model_scores = [group_scores_quality_intersection[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_quality_intersection[model_name]["avg_quality"] = model_avg
print("\nOverall:")
for model_name in evaluate_group:
model_scores = [group_scores_overall[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_overall[model_name]["avg_overall"] = model_avg
print("\nOverall Intersection:")
for model_name in evaluate_group:
model_scores = [group_scores_overall_intersection[model_name][group] for group in GROUPS]
model_avg = np.mean(model_scores)
group_scores_overall_intersection[model_name]["avg_overall"] = model_avg
return group_scores_semantics, group_scores_quality, group_scores_overall, group_scores_semantics_intersection, group_scores_quality_intersection, group_scores_overall_intersection
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="step1x")
parser.add_argument("--save_path", type=str, default="/results/")
parser.add_argument("--backbone", type=str, default="gpt4o", choices=["gpt4o", "qwen25vl"])
parser.add_argument("--language", type=str, default="en", choices=["en", "zh"])
args = parser.parse_args()
model_name = args.model_name
save_path_dir = args.save_path
evaluate_group = [args.model_name]
backbone = args.backbone
save_path_new = os.path.join(save_path_dir, model_name, backbone, "eval_results_new")
print("\nOverall:")
for model_name in evaluate_group:
group_scores_semantics, group_scores_quality, group_scores_overall, group_scores_semantics_intersection, group_scores_quality_intersection, group_scores_overall_intersection = analyze_scores(save_path_new, [model_name], language=args.language)
for group_name in GROUPS:
print(f"{group_name}: {group_scores_semantics[model_name][group_name]:.3f}, {group_scores_quality[model_name][group_name]:.3f}, {group_scores_overall[model_name][group_name]:.3f}")
print(f"Average: {group_scores_semantics[model_name]['avg_semantics']:.3f}, {group_scores_quality[model_name]['avg_quality']:.3f}, {group_scores_overall[model_name]['avg_overall']:.3f}")
print("\nIntersection:")
for group_name in GROUPS:
print(f"{group_name}: {group_scores_semantics_intersection[model_name][group_name]:.3f}, {group_scores_quality_intersection[model_name][group_name]:.3f}, {group_scores_overall_intersection[model_name][group_name]:.3f}")
print(f"Average Intersection: {group_scores_semantics_intersection[model_name]['avg_semantics']:.3f}, {group_scores_quality_intersection[model_name]['avg_quality']:.3f}, {group_scores_overall_intersection[model_name]['avg_overall']:.3f}")
from datasets import Dataset, load_dataset
import math, os
# Dataset info structure:
# - task_type: string - Type of the task
# - key: string - Unique identifier for the sample
# - instruction: string - Task instruction/prompt
# - instruction_language: string - Language of the instruction
# - input_image: Image - Original input image
# - input_image_raw: Image - Raw/unprocessed input image
# - Intersection_exist: bool - Whether intersection exists
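# calculate_dimensions: choose a width/height with the requested aspect ratio whose area is
# close to target_area, with both sides snapped to multiples of 32.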
def calculate_dimensions(target_area, ratio):
# Compute width and height from the target area and aspect ratio
width = math.sqrt(target_area * ratio)
height = width / ratio
# Snap width and height to multiples of 32
width = round(width / 32) * 32
height = round(height / 32) * 32
# Recompute the area and adjust the width so it stays as close to the target area as possible
new_area = width * height
if new_area < target_area:
width += 32
new_area = width * height
elif new_area > target_area:
width -= 32
new_area = width * height
return width, height, new_area
dataset = load_dataset("stepfun-ai/GEdit-Bench")
save_path = "your_save_dir/modelname/"
for item in dataset['train']:
task_type = item['task_type']
key = item['key']
instruction = item['instruction']
instruction_language = item['instruction_language']
input_image = item['input_image']
input_image_raw = item['input_image_raw']
intersection_exist = item['Intersection_exist']
target_width, target_height, new_area = calculate_dimensions(512 * 512, input_image_raw.width / input_image_raw.height)
resize_input_image = input_image_raw.resize((target_width, target_height))
save_path_fullset_source_image = f"{save_path}/fullset/{task_type}/{instruction_language}/{key}_SRCIMG.png"
save_path_fullset = f"{save_path}/fullset/{task_type}/{instruction_language}/{key}.png"
os.makedirs(os.path.dirname(save_path_fullset_source_image), exist_ok=True)
os.makedirs(os.path.dirname(save_path_fullset), exist_ok=True)
input_image.save(save_path_fullset_source_image)
resize_input_image.save(save_path_fullset)
name: qwen25vl
channels:
- pytorch
- defaults
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch
- nvidia
- https://repo.anaconda.com/pkgs/main
- https://repo.anaconda.com/pkgs/r
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- bzip2=1.0.8=h5eee18b_6
- ca-certificates=2025.2.25=h06a4308_0
- ld_impl_linux-64=2.40=h12ee557_0
- libffi=3.4.4=h6a678d5_1
- libgcc-ng=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=1.41.5=h5eee18b_0
- ncurses=6.4=h6a678d5_0
- openssl=3.0.16=h5eee18b_0
- pip=25.0=py311h06a4308_0
- python=3.11.10=he870216_0
- readline=8.2=h5eee18b_0
- sqlite=3.45.3=h5eee18b_0
- tk=8.6.14=h39e8969_0
- wheel=0.45.1=py311h06a4308_0
- xz=5.6.4=h5eee18b_1
- zlib=1.2.13=h5eee18b_1
- pip:
- absl-py==2.2.2
- accelerate==1.6.0
- addict==2.4.0
- aiofiles==24.1.0
- aiohappyeyeballs==2.6.1
- aiohttp==3.11.18
- aiosignal==1.3.2
- aliyun-python-sdk-core==2.16.0
- aliyun-python-sdk-kms==2.16.5
- annotated-types==0.7.0
- anyio==4.9.0
- attrdict==2.0.1
- attrs==25.3.0
- autoawq==0.2.8
- autoawq-kernels==0.0.9
- av==14.3.0
- bcrypt==4.3.0
- binpacking==1.5.2
- boto3==1.37.38
- botocore==1.37.38
- certifi==2025.1.31
- cffi==1.17.1
- charset-normalizer==3.4.1
- click==8.1.8
- contourpy==1.3.2
- cpm-kernels==1.0.11
- crcmod==1.7
- cryptography==44.0.2
- cycler==0.12.1
- dacite==1.9.2
- datasets==3.2.0
- decord==0.6.0
- dill==0.3.8
- distro==1.9.0
- docstring-parser==0.16
- einops==0.8.1
- fastapi==0.115.12
- ffmpy==0.5.0
- filelock==3.18.0
- flash-attn==2.7.4.post1
- fonttools==4.57.0
- frozenlist==1.6.0
- fsspec==2024.9.0
- future==1.0.0
- gekko==1.3.0
- gradio==5.25.2
- gradio-client==1.8.0
- groovy==0.1.2
- grpcio==1.71.0
- h11==0.14.0
- httpcore==1.0.8
- httpx==0.28.1
- huggingface-hub==0.30.2
- idna==3.10
- importlib-metadata==8.6.1
- jieba==0.42.1
- jinja2==3.1.6
- jiter==0.9.0
- jmespath==0.10.0
- joblib==1.4.2
- kiwisolver==1.4.8
- markdown==3.8
- markdown-it-py==3.0.0
- markupsafe==3.0.2
- matplotlib==3.10.1
- mdurl==0.1.2
- megfile==4.1.4
- modelscope==1.25.0
- mpmath==1.3.0
- ms-swift==3.0.0
- multidict==6.4.3
- multiprocess==0.70.16
- networkx==3.4.2
- nltk==3.9.1
- numpy==1.26.4
- nvidia-cublas-cu12==12.4.5.8
- nvidia-cuda-cupti-cu12==12.4.127
- nvidia-cuda-nvrtc-cu12==12.4.127
- nvidia-cuda-runtime-cu12==12.4.127
- nvidia-cudnn-cu12==9.1.0.70
- nvidia-cufft-cu12==11.2.1.3
- nvidia-curand-cu12==10.3.5.147
- nvidia-cusolver-cu12==11.6.1.9
- nvidia-cusparse-cu12==12.3.1.170
- nvidia-cusparselt-cu12==0.6.2
- nvidia-nccl-cu12==2.21.5
- nvidia-nvjitlink-cu12==12.4.127
- nvidia-nvtx-cu12==12.4.127
- openai==1.75.0
- orjson==3.10.16
- oss2==2.19.1
- packaging==25.0
- pandas==2.2.3
- paramiko==3.5.1
- peft==0.14.0
- pillow==11.2.1
- propcache==0.3.1
- protobuf==6.30.2
- psutil==7.0.0
- pyarrow==19.0.1
- pycparser==2.22
- pycryptodome==3.22.0
- pydantic==2.11.3
- pydantic-core==2.33.1
- pydub==0.25.1
- pygments==2.19.1
- pynacl==1.5.0
- pyparsing==3.2.3
- python-dateutil==2.9.0.post0
- python-magic==0.4.27
- python-multipart==0.0.20
- pytz==2025.2
- pyyaml==6.0.2
- qwen-vl-utils==0.0.8
- regex==2024.11.6
- requests==2.32.3
- rich==14.0.0
- rouge==1.0.1
- ruff==0.11.6
- s3transfer==0.11.5
- safehttpx==0.1.6
- safetensors==0.5.3
- scipy==1.15.2
- semantic-version==2.10.0
- sentencepiece==0.2.0
- setuptools==69.5.1
- shellingham==1.5.4
- shtab==1.7.2
- simplejson==3.20.1
- six==1.17.0
- sniffio==1.3.1
- sortedcontainers==2.4.0
- starlette==0.46.2
- sympy==1.13.1
- tensorboard==2.19.0
- tensorboard-data-server==0.7.2
- tiktoken==0.9.0
- tokenizers==0.21.1
- tomlkit==0.13.2
- torch==2.6.0
- torchaudio==2.6.0
- torchvision==0.21.0
- tqdm==4.67.1
- transformers==4.52.0.dev0
- transformers-stream-generator==0.0.5
- triton==3.2.0
- trl==0.11.4
- typeguard==4.4.2
- typer==0.15.2
- typing-extensions==4.13.2
- typing-inspection==0.4.0
- tyro==0.9.19
- tzdata==2025.2
- urllib3==2.4.0
- uvicorn==0.34.2
- websockets==15.0.1
- werkzeug==3.1.3
- xxhash==3.5.0
- yarl==1.20.0
- zipp==3.21.0
- zstandard==0.23.0
prefix: /data/miniconda3/envs/qwen25vl
from viescore import VIEScore
import PIL
import os
import megfile
from PIL import Image
from tqdm import tqdm
from datasets import load_dataset, load_from_disk
import sys
import csv
import threading
import time
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
GROUPS = [
"background_change", "color_alter", "material_alter", "motion_change", "ps_human", "style_change", "subject-add", "subject-remove", "subject-replace", "text_change", "tone_transfer"
]
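# process_single_item: score one (source image, edited image) pair with VIEScore, retrying with a
# growing wait on transient errors (e.g. API rate limits); returns a CSV row dict on success.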
def process_single_item(item, vie_score, max_retries=10000):
instruction = item['instruction']
key = item['key']
instruction_language = item['instruction_language']
intersection_exist = item['Intersection_exist']
sample_prefix = key
save_path_fullset_source_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}_SRCIMG.png"
save_path_fullset_result_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}.png"
src_image_path = save_path_fullset_source_image
save_path_item = save_path_fullset_result_image
for retry in range(max_retries):
try:
pil_image_raw = Image.open(megfile.smart_open(src_image_path, 'rb'))
pil_image_edited = Image.open(megfile.smart_open(save_path_item, 'rb')).convert("RGB").resize((pil_image_raw.size[0], pil_image_raw.size[1]))
text_prompt = instruction
score_list = vie_score.evaluate([pil_image_raw, pil_image_edited], text_prompt)
sementics_score, quality_score, overall_score = score_list
print(f"sementics_score: {sementics_score}, quality_score: {quality_score}, overall_score: {overall_score}, instruction_language: {instruction_language}, instruction: {instruction}")
return {
"source_image": src_image_path,
"edited_image": save_path_item,
"instruction": instruction,
"sementics_score": sementics_score,
"quality_score": quality_score,
"intersection_exist" : item['Intersection_exist'],
"instruction_language" : item['instruction_language']
}
except Exception as e:
if retry < max_retries - 1:
wait_time = (retry + 1) * 2 # linear backoff: wait 2s, 4s, 6s, ...
print(f"Error processing {save_path_item} (attempt {retry + 1}/{max_retries}): {e}")
print(f"Waiting {wait_time} seconds before retry...")
time.sleep(wait_time)
else:
print(f"Failed to process {save_path_item} after {max_retries} attempts: {e}")
return
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="gpt4o")
parser.add_argument("--save_path", type=str, default="/results/")
parser.add_argument("--backbone", type=str, default="gpt4o", choices=["gpt4o", "qwen25vl"])
args = parser.parse_args()
model_name = args.model_name
save_path_dir = args.save_path
evaluate_group = [args.model_name]
backbone = args.backbone
vie_score = VIEScore(backbone=backbone, task="tie", key_path='secret_t2.env')
max_workers = 5
dataset = load_dataset("stepfun-ai/GEdit-Bench")
for model_name in evaluate_group:
save_path = os.path.join(save_path_dir, model_name)
save_path_new = os.path.join(save_path_dir, model_name, backbone, "eval_results_new")
all_csv_list = [] # Store all results for final combined CSV
# Load existing processed samples from final CSV if it exists
processed_samples = set()
final_csv_path = os.path.join(save_path_new, f"{model_name}_combined_gpt_score.csv")
if megfile.smart_exists(final_csv_path):
with megfile.smart_open(final_csv_path, 'r', newline='') as f:
reader = csv.DictReader(f)
for row in reader:
# Create a unique identifier for each sample
sample_key = (row['source_image'], row['edited_image'])
processed_samples.add(sample_key)
print(f"Loaded {len(processed_samples)} processed samples from existing CSV")
for group_name in GROUPS:
group_csv_list = []
group_dataset_list = []
for item in tqdm(dataset['train'], desc=f"Processing {model_name} - {group_name}"):
if item['task_type'] == group_name:
group_dataset_list.append(item)
# Load existing group CSV if it exists
group_csv_path = os.path.join(save_path_new, f"{model_name}_{group_name}_gpt_score.csv")
if megfile.smart_exists(group_csv_path):
with megfile.smart_open(group_csv_path, 'r', newline='') as f:
reader = csv.DictReader(f)
group_results = list(reader)
group_csv_list.extend(group_results)
print(f"Loaded existing results for {model_name} - {group_name}")
print(f"Processing group: {group_name}")
print(f"Processing model: {model_name}")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for item in group_dataset_list:
instruction = item['instruction']
key = item['key']
instruction_language = item['instruction_language']
intersection_exist = item['Intersection_exist']
sample_prefix = key
save_path_fullset_source_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}_SRCIMG.png"
save_path_fullset_result_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}.png"
if not megfile.smart_exists(save_path_fullset_result_image) or not megfile.smart_exists(save_path_fullset_source_image):
print(f"Skipping {sample_prefix}: Source or edited image does not exist")
continue
# Check if this sample has already been processed
sample_key = (save_path_fullset_source_image, save_path_fullset_result_image)
exists = sample_key in processed_samples
if exists:
print(f"Skipping already processed sample: {sample_prefix}")
continue
future = executor.submit(process_single_item, item, vie_score)
futures.append(future)
for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {model_name} - {group_name}"):
result = future.result()
if result:
group_csv_list.append(result)
# Save group-specific CSV
group_csv_path = os.path.join(save_path_new, f"{model_name}_{group_name}_gpt_score.csv")
with megfile.smart_open(group_csv_path, 'w', newline='') as f:
fieldnames = ["source_image", "edited_image", "instruction", "sementics_score", "quality_score", "intersection_exist", "instruction_language"]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for row in group_csv_list:
writer.writerow(row)
all_csv_list.extend(group_csv_list)
print(f"Saved group CSV for {group_name}, length: {len(group_csv_list)}")
# After processing all groups, calculate and save combined results
if not all_csv_list:
print(f"Warning: No results for model {model_name}, skipping combined CSV generation")
continue
# Save combined CSV
combined_csv_path = os.path.join(save_path_new, f"{model_name}_combined_gpt_score.csv")
with megfile.smart_open(combined_csv_path, 'w', newline='') as f:
fieldnames = ["source_image", "edited_image", "instruction", "sementics_score", "quality_score", "intersection_exist", "instruction_language"]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for row in all_csv_list:
writer.writerow(row)
import sys
sys.path.insert(0, 'viescore')
from utils import (
mllm_output_to_dict
)
import math
import vie_prompts
class VIEScore:
def __init__(self, backbone="gpt4o", task="t2i", key_path=None) -> None:
self.task = task
self.backbone_name = backbone
if self.task not in ["t2i", "tie", "t2v"]:
raise ValueError("task must be either 't2i' or 'tie'")
if self.backbone_name == "gpt4o":
from mllm_tools.openai import GPT4o
self.model = GPT4o(key_path, model_name="gpt-4.1")
elif self.backbone_name == "gpt4v":
from mllm_tools.openai import GPT4v
self.model = GPT4v(key_path)
elif self.backbone_name == "gemini":
from mllm_tools.gemini import Gemini
self.model = Gemini()
elif self.backbone_name == "idefics2":
from mllm_tools.idefics2_eval import Idefics2
self.model = Idefics2()
elif self.backbone_name == "mantis":
from mllm_tools.mantis_idefics2_eval import Mantis
self.model = Mantis()
elif self.backbone_name == "minicpmv":
from mllm_tools.minicpmv_eval import MiniCPMV
self.model = MiniCPMV()
elif self.backbone_name == "qwen25vl":
from mllm_tools.qwen25vl_eval import Qwen25VL
self.model = Qwen25VL()
else:
raise NotImplementedError("backbone not supported")
self.context = vie_prompts._context_no_delimit
if self.task == "t2i":
self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_image_gen_rule, vie_prompts._prompts_0shot_t2i_rule_SC])
self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ])
elif self.task == "tie":
self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC])
self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ])
elif self.task == "t2v":
self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_video_gen_rule, vie_prompts._prompts_0shot_t2v_rule_SC])
self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_t2v_rule_PQ])
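# evaluate: query the backbone with the SC (semantic consistency) and PQ (perceptual quality)
# prompts, parse each reply into a score dict, and by default return
# [min SC score, min PQ score, sqrt(SC * PQ)].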
def evaluate(self, image_prompts, text_prompt, extract_overall_score_only=False, extract_all_score=True, echo_output=False):
if not isinstance(image_prompts, list):
image_prompts = [image_prompts]
if self.backbone_name in ['gpt4o', 'gpt4v']:
self.model.use_encode = False if isinstance(image_prompts[0], str) else True
#print("Using encode:", self.model.use_encode)
if self.task == "t2i":
_SC_prompt = self.SC_prompt.replace("<prompt>", text_prompt)
elif self.task == "tie":
_SC_prompt = self.SC_prompt.replace("<instruction>", text_prompt)
elif self.task == "t2v":
_SC_prompt = self.SC_prompt.replace("<prompt>", text_prompt)
SC_prompt_final = self.model.prepare_prompt(image_prompts, _SC_prompt)
if self.task == "tie":
PQ_prompt_final = self.model.prepare_prompt(image_prompts[-1], self.PQ_prompt)
else:
PQ_prompt_final = self.model.prepare_prompt(image_prompts, self.PQ_prompt)
results_dict = {}
SC_dict = False
PQ_dict = False
tries = 0
max_tries = 1
while SC_dict is False or PQ_dict is False:
tries += 1
guess_if_cannot_parse = True if tries > max_tries else False
result_SC = self.model.get_parsed_output(SC_prompt_final)
result_PQ = self.model.get_parsed_output(PQ_prompt_final)
SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
if SC_dict == "rate_limit_exceeded" or PQ_dict == "rate_limit_exceeded":
print("rate_limit_exceeded")
raise ValueError("rate_limit_exceeded")
results_dict['SC'] = SC_dict
results_dict['PQ'] = PQ_dict
if echo_output:
print("results_dict", results_dict)
if extract_all_score:
SC_score = min(results_dict['SC']['score'])
PQ_score = min(results_dict['PQ']['score'])
O_score = math.sqrt(SC_score * PQ_score)
return [SC_score, PQ_score, O_score]
if extract_overall_score_only:
SC_scores = results_dict['SC']['score']
PQ_scores = results_dict['PQ']['score']
O_score = math.sqrt(min(SC_scores) * min(PQ_scores))
return O_score
return results_dict
if __name__ == "__main__":
model = VIEScore(backbone="gemini", task="t2i")
from datasets import load_dataset
dataset = load_dataset("TIGER-Lab/GenAI-Arena-Bench", "image_generation")
dataset = dataset["test"]
print("Now running the VIEScore model")
for idx in range(5):
left_image = dataset['left_image'][idx]
right_image = dataset['right_image'][idx]
prompt = dataset['prompt'][idx]
print(model.evaluate(left_image, prompt, extract_all_score=True))
print(model.evaluate(right_image, prompt, extract_all_score=True))
"""
Install the Google AI Python SDK
$ pip install google-generativeai
See the getting started guide for more information:
https://ai.google.dev/gemini-api/docs/get-started/python
"""
import requests
from PIL import Image
from io import BytesIO
import os
from typing import List
from urllib.parse import urlparse
import google.generativeai as genai
import tempfile
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
def upload_to_gemini(input, mime_type=None):
"""Uploads the given file or PIL image to Gemini.
See https://ai.google.dev/gemini-api/docs/prompting_with_media
"""
if isinstance(input, str):
# Input is a file path
file = genai.upload_file(input, mime_type=mime_type)
elif isinstance(input, Image.Image):
# Input is a PIL image
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
input.save(tmp_file, format="JPEG")
tmp_file_path = tmp_file.name
file = genai.upload_file(tmp_file_path, mime_type=mime_type or "image/jpeg")
os.remove(tmp_file_path)
else:
raise ValueError("Unsupported input type. Must be a file path or PIL Image.")
#print(f"Uploaded file '{file.display_name}' as: {file.uri}")
return file
def save_image_from_url(url, base_save_directory='tmp', file_name=None):
# Parse the URL to create a directory path
parsed_url = urlparse(url)
url_path = os.path.join(parsed_url.netloc, parsed_url.path.lstrip('/'))
save_directory = os.path.join(base_save_directory, os.path.dirname(url_path))
# Create the directory if it doesn't exist
if not os.path.exists(save_directory):
os.makedirs(save_directory)
# Get the image from the URL
response = requests.get(url)
if response.status_code == 200:
# Open the image
image = Image.open(BytesIO(response.content))
# Set the file name if not provided
if not file_name:
file_name = os.path.basename(parsed_url.path)
# Save the image locally
file_path = os.path.join(save_directory, file_name)
image.save(file_path)
return file_path
else:
raise Exception(f"Failed to retrieve image from URL. Status code: {response.status_code}")
class Gemini():
def __init__(self, model_name="gemini-1.5-pro-latest"):
# Create the model
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
"temperature": 1,
"top_p": 0.95,
"top_k": 64,
"max_output_tokens": 8192,
"response_mime_type": "text/plain",
}
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},
]
self.model = genai.GenerativeModel(
model_name=model_name,
safety_settings=safety_settings,
generation_config=generation_config,
)
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
if not isinstance(image_links, list):
image_links = [image_links]
images_prompt = []
for image_link in image_links:
if isinstance(image_link, str):
image = save_image_from_url(image_link)
else:
image = image_link
image = upload_to_gemini(image, mime_type="image/jpeg")
images_prompt.append(image)
prompt_content = [images_prompt, text_prompt]
return prompt_content
def get_parsed_output(self, prompt):
images_prompt = prompt[0]
text_prompt = prompt[1]
chat_session = self.model.start_chat(
history=[
{
"role": "user",
"parts": images_prompt,
},
]
)
try:
response = chat_session.send_message(text_prompt)
except:
return "Error in sending message to chat session."
return self.extract_response(response)
def extract_response(self, response):
response = response.text
return response
if __name__ == "__main__":
model = Gemini()
prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
print("prompt : \n", prompt)
res = model.get_parsed_output(prompt)
print("result : \n", res)
import os
import torch
import time
from typing import List
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from transformers.utils import is_flash_attn_2_available
class Idefics2():
def __init__(self, model_path:str="HuggingFaceM4/idefics2-8b") -> None:
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
print(f"Using {attn_implementation} for attention implementation")
self.model = AutoModelForVision2Seq.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, _attn_implementation=attn_implementation).eval()
self.processor = AutoProcessor.from_pretrained(model_path)
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
if not isinstance(image_links, list):
image_links = [image_links]
messages = [
{
"role": "user",
"content": [ {"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
}
]
prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
images = [load_image(image_link) for image_link in image_links] #Support PIL images as well
inputs = self.processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
return inputs
def get_parsed_output(self, inputs):
generate_ids = self.model.generate(**inputs, max_new_tokens=512, num_beams=1)
generated_text = self.processor.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
return generated_text
if __name__ == "__main__":
model = Idefics2()
prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
#print("prompt : \n", prompt)
res = model.get_parsed_output(prompt)
print("result : \n", res)
import os
import torch
import time
from typing import List
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from transformers.utils import is_flash_attn_2_available
class Mantis():
def __init__(self, model_path:str="TIGER-Lab/Mantis-8B-Idefics2") -> None:
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
print(f"Using {attn_implementation} for attention implementation")
self.model = AutoModelForVision2Seq.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, _attn_implementation=attn_implementation).eval()
self.processor = AutoProcessor.from_pretrained(model_path)
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
if not isinstance(image_links, list):
image_links = [image_links]
messages = [
{
"role": "user",
"content": [ {"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
}
]
prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
images = [load_image(image_link) for image_link in image_links] #Support PIL images as well
inputs = self.processor(text=prompt, images=images, return_tensors="pt")
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
return inputs
def get_parsed_output(self, inputs):
generate_ids = self.model.generate(**inputs, max_new_tokens=512, num_beams=1)
generated_text = self.processor.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
return generated_text
if __name__ == "__main__":
model = Mantis()
prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
#print("prompt : \n", prompt)
res = model.get_parsed_output(prompt)
print("result : \n", res)
import os
import torch
import time
from PIL import Image
from typing import List
from transformers import AutoModel, AutoTokenizer
from transformers.utils import is_flash_attn_2_available
class MiniCPMV():
def __init__(self) -> None:
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
self.model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.float16, device_map='auto', _attn_implementation=attn_implementation).eval()
self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
print(f"Using {attn_implementation} for attention implementation")
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
if not isinstance(image_links, list):
image_links = [image_links]
messages = [
{
"role": "user",
"content": [ {"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
}
]
return messages
def get_parsed_output(self, inputs):
res = self.model.chat(
image=None,
msgs=inputs,
tokenizer=self.tokenizer,
sampling=False, # if sampling=False, beam_search will be used by default
)
return res
if __name__ == "__main__":
model = MiniCPMV()
prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
#print("prompt : \n", prompt)
res = model.get_parsed_output(prompt)
print("result : \n", res)
import base64
import requests
from io import BytesIO, StringIO
from typing import Union, Optional, Tuple, List
from PIL import Image, ImageOps
import os
def get_api_key(file_path):
# Read the API key from the first line of the file
with open(file_path, 'r') as file:
return file.readline().strip()
# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
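# Rotate to the next API-key file in the list (wrapping around); used to switch keys when a
# request fails with rate_limit_exceeded or insufficient_quota.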
def pick_next_item(current_item, item_list):
if current_item not in item_list:
raise ValueError("Current item is not in the list")
current_index = item_list.index(current_item)
next_index = (current_index + 1) % len(item_list)
return item_list[next_index]
# Function to encode a PIL image
def encode_pil_image(pil_image):
# Create an in-memory binary stream
image_stream = BytesIO()
# Save the PIL image to the binary stream in JPEG format (you can change the format if needed)
pil_image.save(image_stream, format='JPEG')
# Get the binary data from the stream and encode it as base64
image_data = image_stream.getvalue()
base64_image = base64.b64encode(image_data).decode('utf-8')
return base64_image
def load_image(image: Union[str, Image.Image], format: str = "RGB", size: Optional[Tuple] = None) -> Image.Image:
"""
Load an image from a given path or URL and convert it to a PIL Image.
Args:
image (Union[str, Image.Image]): The image path, URL, or a PIL Image object to be loaded.
format (str, optional): Desired color format of the resulting image. Defaults to "RGB".
size (Optional[Tuple], optional): Desired size for resizing the image. Defaults to None.
Returns:
Image.Image: A PIL Image in the specified format and size.
Raises:
ValueError: If the provided image format is not recognized.
"""
if isinstance(image, str):
if image.startswith("http://") or image.startswith("https://"):
image = Image.open(requests.get(image, stream=True).raw)
elif os.path.isfile(image):
image = Image.open(image)
else:
raise ValueError(
f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
)
elif isinstance(image, Image.Image):
image = image
else:
raise ValueError(
"Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
)
image = ImageOps.exif_transpose(image)
image = image.convert(format)
if size is not None:
image = image.resize(size, Image.LANCZOS)
return image
class GPT4v():
def __init__(self, api_key_path='keys/secret.env', are_images_encoded=False, model_name="gpt-4-vision-preview"):
"""OpenAI GPT-4-vision model wrapper
Args:
api_key_path (str): Path to the API key file. Defaults to 'keys/secret.env'.
are_images_encoded (bool): Whether the images are encoded in base64. Defaults to False.
"""
self.multiple_api_keys = False
self.current_key_file = None
self.key_lists = None
if isinstance(api_key_path, list):
self.key_lists = api_key_path
self.current_key_file = api_key_path[0]
self.api_key = get_api_key(self.current_key_file)
self.multiple_api_keys = True
else:
self.api_key = get_api_key(api_key_path)
if not self.api_key:
print("API key not found.")
exit(1)
self.url = "https://api.openai.com/v1/chat/completions"
self.model_name = model_name
self.use_encode = are_images_encoded
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
prompt_content = []
text_dict = {
"type": "text",
"text": text_prompt
}
prompt_content.append(text_dict)
if not isinstance(image_links, list):
image_links = [image_links]
for image_link in image_links:
image = load_image(image_link)
if self.use_encode == True:
visual_dict = {
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{encode_pil_image(image)}"}
}
else:
visual_dict = {
"type": "image_url",
"image_url": {"url": image_link}
}
prompt_content.append(visual_dict)
return prompt_content
def get_parsed_output(self, prompt):
payload = {
"model": self.model_name,
"messages": [
{
"role": "user",
"content": prompt
}
],
"max_tokens": 1400
}
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}"
}
response = requests.post(self.url, json=payload, headers=headers)
#return response.text
return self.extract_response(response)
def extract_response(self, response):
response = response.json()
try:
out = response['choices'][0]['message']['content']
return out
except:
if response['error']['code'] == 'content_policy_violation':
print("Code is content_policy_violation")
elif response['error']['code'] == 'rate_limit_exceeded' or response['error']['code'] == 'insufficient_quota':
print(f"Code is {response['error']['code']}")
print(response['error']['message'])
if self.multiple_api_keys == True:
new_key = pick_next_item(self.current_key_file, self.key_lists)
self.update_key(new_key)
self.current_key_file = new_key #override key
print("New key is from the file: ", new_key)
else:
print("Code is different")
print(response)
return ""
def update_key(self, key, load_from_file=True):
if load_from_file:
self.api_key = get_api_key(key)
else:
self.api_key = key
class GPT4o(GPT4v):
def __init__(self, api_key_path='keys/secret.env', are_images_encoded=False, model_name="gpt-4o-2024-05-13"):
super().__init__(api_key_path, are_images_encoded, model_name)
if __name__ == "__main__":
model = GPT4o('secret_t2.env', model_name="gpt-4.1")
prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
print("prompt : \n", prompt)
res = model.get_parsed_output(prompt)
print("result : \n", res)
import os
import torch
import time
from PIL import Image
from typing import List
from transformers import AutoModel, AutoTokenizer
from transformers.utils import is_flash_attn_2_available
from transformers import Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
import requests
from io import BytesIO
import random
import numpy as np
import base64
import magic
import megfile
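# Helpers: serialize a PIL image to PNG bytes, then wrap the bytes as a base64 data URI so they
# can be passed to the Qwen2.5-VL chat template as an image entry.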
def process_image(image):
img_byte_arr = BytesIO()
image.save(img_byte_arr, format='PNG')
img_byte_arr = img_byte_arr.getvalue()
return img_byte_arr
def convert_image_to_base64(file_content):
mime_type = magic.from_buffer(file_content, mime=True)
base64_encoded_data = base64.b64encode(file_content).decode('utf-8')
return f"data:{mime_type};base64,{base64_encoded_data}"
def set_seed(seed: int):
"""
Args:
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
seed (`int`): The seed to set.
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
class Qwen25VL():
def __init__(self) -> None:
attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ",
torch_dtype=torch.float16,
device_map="auto"
).eval()
self.processor = AutoProcessor.from_pretrained("/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ")
print(f"Using {attn_implementation} for attention implementation")
def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
if not isinstance(image_links, list):
image_links = [image_links]
image_links_base64 = []
for img_link in image_links:
if type(img_link) == str:
image_links_base64.append(convert_image_to_base64(process_image(Image.open(megfile.smart_open(img_link, 'rb')))))
else:
image_links_base64.append(convert_image_to_base64(process_image(img_link)))
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": img_link} for img_link in image_links_base64
] + [{"type": "text", "text": text_prompt}]
}
]
return messages
def get_parsed_output(self, messages):
set_seed(42)
# Prepare the inputs
text = self.processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
# Process inputs
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt"
)
inputs = inputs.to("cuda")
# Generate output
generation_config = {
"max_new_tokens": 512,
"num_beams": 1,
"do_sample": False,
"temperature": 0.1,
"top_p": None,
}
generated_ids = self.model.generate(**inputs, **generation_config)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = self.processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)
return output_text[0] if output_text else ""
if __name__ == "__main__":
model = Qwen25VL()
prompt = model.prepare_prompt(
["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"],
'Describe the image in detail.'
)
res = model.get_parsed_output(prompt)
print("result : \n", res)
from typing import List
import base64
from io import BytesIO
from PIL import Image
import requests
def pil_image_to_base64(pil_image, format="PNG"):
buffered = BytesIO()
pil_image.save(buffered, format=format) # Save image to the buffer in the specified format
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') # Encode the buffer's content to base64
return img_str
def load_image(image_file):
if image_file.startswith("http"):
response = requests.get(image_file)
image = Image.open(BytesIO(response.content)).convert("RGB")
else:
import os
image = Image.open(image_file).convert("RGB")
return image
def load_images(image_files):
out = []
for image_file in image_files:
image = load_image(image_file)
out.append(image)
return out
def merge_images(image_links: List = []):
"""Merge multiple images into one image
Args:
image_links (List, optional): List of image links. Defaults to [].
Returns:
Image.Image: The merged image, or None if image_links is empty.
"""
if len(image_links) == 0:
return None
images = load_images(image_links)
if len(images) == 1:
return images[0]
widths, heights = zip(*(i.size for i in images))
average_height = sum(heights) // len(heights)
for i, im in enumerate(images):
# scale in proportion
images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height))
widths, heights = zip(*(i.size for i in images))
total_width = sum(widths)
max_height = max(heights)
new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height))
x_offset = 0
for i, im in enumerate(images):
if i > 0:
# paste a separator column starting at x_offset: 1 black pixel, 8 white pixels, then 1 black pixel
new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0))
x_offset += 1
new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0))
x_offset += 8
new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0))
x_offset += 1
new_im.paste(im, (x_offset, 0))
x_offset += im.size[0]
return new_im
import os
def create_python_file_with_texts(folder_path, output_file):
with open(output_file, 'w', encoding='utf-8') as out_file:
out_file.write("# This file is generated automatically through parse_prompt.py\n\n")
for root, dirs, files in os.walk(folder_path):
for file in files:
if file.endswith(".txt"):
file_path = os.path.join(root, file)
var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_")
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().replace('"""', '\"\"\"')
out_file.write(f'{var_name} = """{content}"""\n\n')
# Example usage
current_file_path = os.path.abspath(__file__)
current_folder_path = os.path.dirname(current_file_path)
folder_path = os.path.join(current_folder_path, "prompts_raw")
output_file = os.path.join(current_folder_path, "vie_prompts.py")
create_python_file_with_texts(folder_path, output_file)
import os
from typing import Union, List, Optional
import json
import regex as re
import ast
import random
def fix_json(input_str):
# Add double quotes around keys using regex
fixed_str = re.sub(r'(\w+):', r'"\1":', input_str)
# Add double quotes around string values if necessary and wrap int/float values in []
def format_value(match):
key, value, comma = match.groups()
value = value.strip()
# Check if value is an integer or float
if re.match(r'^-?\d+(\.\d+)?$', value):
value = f'[{value}]'
# Check if value is a boolean or null
elif re.match(r'^(true|false|null)$', value, re.IGNORECASE):
pass # leave as is
else:
# Add quotes around string values
value = f'"{value}"'
return f'{key}: {value}{comma}'
fixed_str = re.sub(r'(".*?"):(.*?)(,|})', format_value, fixed_str)
return fixed_str
def read_file_to_string(file_path):
"""
Reads the contents of a text file and returns it as a string.
:param file_path: The path to the text file.
:return: A string containing the contents of the file.
"""
try:
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()
except FileNotFoundError:
print(f"The file {file_path} was not found.")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None
def read_files_to_string(file_paths):
"""
Reads the contents of multiple text files and returns them as a single string,
with each file's contents separated by a newline.
:param file_paths: A list of paths to text files.
:return: A string containing the concatenated contents of the files.
"""
all_contents = [] # List to hold the contents of each file
for file_path in file_paths:
try:
with open(file_path, 'r', encoding='utf-8') as file:
all_contents.append(file.read())
except FileNotFoundError:
print(f"The file {file_path} was not found.")
except Exception as e:
print(f"An error occurred while reading {file_path}: {e}")
# Join all the contents with a newline character
return "\n".join(all_contents)
def get_file_path(filename: Union[str, os.PathLike], search_from: Union[str, os.PathLike] = "."):
"""
Search for a file across a directory and return its absolute path.
Args:
filename (Union[str, os.PathLike]): The name of the file to search for.
search_from (Union[str, os.PathLike], optional): The directory from which to start the search. Defaults to ".".
Returns:
str: Absolute path to the found file.
Raises:
FileNotFoundError: If the file is not found.
"""
for root, dirs, files in os.walk(search_from):
for name in files:
if name == filename:
return os.path.abspath(os.path.join(root, name))
raise FileNotFoundError(filename, "not found.")
#+=========================================================================================
def verify(s, target_sequence):
# Count the occurrences of the target sequence
count = s.count(target_sequence)
# Check if the target sequence appears exactly twice
return count == 2
def is_int_between_0_and_10(s):
try:
num = int(s)
return 0 <= num <= 10
except ValueError:
return False
def is_str_a_list_of_ints_0_to_10(s):
try:
# Attempt to parse the string as a Python literal (list, dict, etc.)
parsed = ast.literal_eval(s)
# Check if the parsed object is a list
if not isinstance(parsed, list):
return False
# Check if all elements are integers and between 0 to 10
return all(isinstance(item, int) and 0 <= item <= 10 for item in parsed)
except (ValueError, SyntaxError):
# If parsing fails or any other error occurs
return False
def is_str_valid_score_format_brackets(s):
try:
# Removing brackets and splitting the string by commas
content = s.strip("[]").split(',')
length = len(content)
# Parsing each element and checking the format and range
scores = {}
for item in content:
key, value = item.split(':')
key = key.strip()
value = int(value.strip())
# Check if the key starts with 'score' and the value is in the correct range
if not key.startswith("score") or not 0 <= value <= 10:
return False
scores[key] = value
fetch_words = [f"score{i+1}" for i in range(length)]
# Check if at least 'score1' and 'score2' are present
return all(key in scores for key in fetch_words)
except (ValueError, SyntaxError):
# If any parsing error occurs
return False
#+=========================================================================================
def mllm_output_to_dict(input_string, give_up_parsing=False):
"""
Args:
input_string (str): actually the output of the mllm model to be parsed
output_file_name (str): The name of the output file.
"""
# Catch for gpt4v rate_limit_exceeded error
if input_string == "rate_limit_exceeded":
return "rate_limit_exceeded"
# Define the delimiters
delimiter = '||V^=^V||'
if input_string.count(delimiter) == 2:
if not verify(input_string, delimiter):
print("The required delimiters were not found correctly in the string.")
return False
# Extract the content between the delimiters
start_index = input_string.find(delimiter) + len(delimiter)
end_index = input_string.rfind(delimiter)
else:
# find the JSON manually
# some MLLMs tend not to output the delimiters, but they do output the JSON content,
# so we will find the JSON content manually
start_index = input_string.find('{')
end_index = input_string.rfind('}') + 1
if start_index == -1 or end_index == 0:
# json not found
# some mllm tends to output only a list of scores like [6, 0],
# this time we will just get the scores and ignore the reasoning (other part of the json)
start_index = input_string.find('[')
end_index = input_string.rfind(']') + 1
if give_up_parsing: # if we want to give up parsing
guessed_value = random.randint(0, 10)
print(f"Failed to find the json content in the string. Guess a value : {guessed_value}.")
json_content = {'score': [guessed_value], "reasoning": f"guess_if_cannot_parse | {input_string}"}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
scores = json.loads(input_string[start_index:end_index])
if not isinstance(scores, list):
scores = [scores]
json_content = {'score': scores, "reasoning": "System: output is simply a list of scores"}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
elif is_int_between_0_and_10(input_string): # if output is simply a number
scores = [int(input_string)]
json_content = {'score': scores, "reasoning": "System: output is simply a number"}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
else:
print("Failed to find the json content in the string.")
return False
# Check if we found two delimiters
if start_index != -1 and end_index != -1 and start_index != end_index:
# Extract the JSON string
json_str = input_string[start_index:end_index].strip()
json_str = json_str.replace("\n", "")
# Parse the JSON string into a dictionary
try:
new_data = json.loads(json_str)
if not isinstance(new_data['score'], list):
new_data['score'] = [new_data['score']]
except:
print("Now fixing: ", json_str)
try:
new_data = json.loads(fix_json(json_str))
return new_data
except:
print("Error: Cannot fix", json_str)
return False
return new_data
else:
print("The required delimiters were not found correctly in the string.")
return False
def write_entry_to_json_file(input_string, uid, prompt_input, vision_input, output_file_name, give_up_parsing=False):
"""
Args:
input_string (str): actually the output of the mllm model to be parsed
uid (str): The unique identifier for the each item in the test data
prompt_input (str): The prompt input for the entry. text prompt.
vision_input (str): The vision input for the entry. image links.
output_file_name (str): The name of the output file.
"""
# Catch for gpt4v rate_limit_exceeded error
if input_string == "rate_limit_exceeded":
return "rate_limit_exceeded"
# Define the delimiters
delimiter = '||V^=^V||'
if input_string.count(delimiter) == 2:
if not verify(input_string, delimiter):
print("The required delimiters were not found correctly in the string.")
return False
# Extract the content between the delimiters
start_index = input_string.find(delimiter) + len(delimiter)
end_index = input_string.rfind(delimiter)
else:
# find the JSON manually
# some MLLMs tend not to output the delimiters, but they do output the JSON content,
# so we will find the JSON content manually
start_index = input_string.find('{')
end_index = input_string.rfind('}') + 1
if start_index == -1 or end_index == 0:
# json not found
# some mllm tends to output only a list of scores like [6, 0],
# this time we will just get the scores and ignore the reasoning (other part of the json)
start_index = input_string.find('[')
end_index = input_string.rfind(']') + 1
if give_up_parsing: # if we want to give up parsing
guessed_value = random.randint(0, 10)
print(f"Failed to find the json content in the string. Guess a value : {guessed_value}.")
json_content = {'score': [guessed_value], "reasoning": f"guess_if_cannot_parse | {input_string}"}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
scores = json.loads(input_string[start_index:end_index])
json_content = {'score': scores, "reasoning": None}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
elif is_int_between_0_and_10(input_string): # if output is simply a number
scores = [int(input_string)]
json_content = {'score': scores, "reasoning": None}
json_str = json.dumps(json_content)
input_string = json_str
start_index = 0
end_index = len(json_str)
else:
print("Failed to find the json content in the string.")
return False
# Check if we found two delimiters
if start_index != -1 and end_index != -1 and start_index != end_index:
# Extract the JSON string
json_str = input_string[start_index:end_index].strip()
json_str = json_str.replace("\n", "")
try:
# Parse the JSON string into a dictionary
new_data = json.loads(json_str)
# Ensure the directory exists
os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
# Initialize or load existing data
if os.path.exists(output_file_name):
with open(output_file_name, 'r') as json_file:
data = json.load(json_file)
else:
data = {}
# If the additional key is already in the data, add or update notes
if uid in data:
data[uid].update(new_data) # Update with new data
if prompt_input: # If there are new notes, update or add them
data[uid]['prompt_input'] = prompt_input
if vision_input: # If there are new notes, update or add them
data[uid]['vision_input'] = vision_input
else:
# If it's a new key, add the entry to the dictionary
data[uid] = new_data
if prompt_input:
data[uid]['prompt_input'] = prompt_input
if vision_input:
data[uid]['vision_input'] = vision_input
# Write the updated data to the file
with open(output_file_name, 'w') as json_file:
json.dump(data, json_file, indent=4)
print(f"Data was successfully updated in {output_file_name}")
return True
except json.JSONDecodeError as e:
print(f"An error occurred while parsing the JSON content: {e}")
return False
else:
print("The required delimiters were not found correctly in the string.")
return False
def check_key_in_json(file_path, key):
    try:
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            # Check if the key exists at the top level of the JSON structure
            if key in data:
                return True
            else:
                return False
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    except json.JSONDecodeError as e:
        print(f"Error reading {file_path}: {e}")
    except Exception as e:
        print(f"An error occurred with {file_path}: {e}")
    return False
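if __name__ == "__main__":
    # Minimal usage sketch (editorial addition; the sample output string, uid, and
    # file paths below are hypothetical and not part of the benchmark pipeline).
    sample_output = '||V^=^V|| {"score": [8, 7], "reasoning": "edit applied, quality acceptable"} ||V^=^V||'
    wrote = write_entry_to_json_file(
        input_string=sample_output,
        uid="example_key_0001",
        prompt_input="Change the background to a beach.",
        vision_input="example_key_0001.png",
        output_file_name="results_example/example_scores.json",
    )
    print("written:", wrote)
    print("key present:", check_key_in_json("results_example/example_scores.json", "example_key_0001"))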
\ No newline at end of file
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.