# evaluate_ssc.py

import openai
import os
import argparse
import json
import ast
from multiprocessing.pool import Pool
from tqdm import tqdm

def parse_args():
    """
    Build the command-line interface of the evaluation script and parse it.

    Returns:
        argparse.Namespace with the attributes ``pred_path``, ``output_dir``,
        ``output_json``, ``api_key`` (strings) and ``num_tasks`` (int).
    """
    # One entry per option: the flag followed by its add_argument keywords.
    option_table = (
        ("--pred_path", {
            "default": "output_dir/qwen/pred_subPlot_all.json",
            "help": "The path to file containing prediction.",
        }),
        ("--output_dir", {
            "default": "output_dir/qwen_subplot_all",
            "help": "The path to save annotation json files.",
        }),
        ("--output_json", {
            "default": "output_dir/qwen_subplot_all_results.json",
            "help": "The path to save annotation final combined json file.",
        }),
        ("--api_key", {
            "default": "",
            "help": "OpenAI API key.",
        }),
        ("--num_tasks", {
            "default": 1,
            "type": int,
            "help": "Number of splits.",
        }),
    )

    cli = argparse.ArgumentParser(description="ssc-evaluation-gpt-4")
    for flag, keywords in option_table:
        cli.add_argument(flag, **keywords)
    return cli.parse_args()


def get_scoring_points(score_points="MLVU_all/json/8_sub_scene.json"):
    """
    Load the question -> scoring points lookup from an annotation file.

    Args:
        score_points: Path to a JSON file containing a list of records, each
            with a "question" and a "scoring_points" field.

    Returns:
        dict mapping every question to its "scoring_points" value. When a
        question occurs more than once, the last record wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks "question" or "scoring_points".
    """
    # The context manager closes the handle deterministically; JSON text is
    # decoded as UTF-8 instead of the platform's locale encoding.
    with open(score_points, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    return {data["question"]: data["scoring_points"] for data in all_data}


def annotate(prediction_set, caption_files, output_dir):
    """
    Score predictions with GPT-4 and write one result file per sample.

    For every file name in `caption_files` the matching entry of
    `prediction_set` is sent to the chat-completion endpoint together with
    the scoring points of its question. The raw reply is stored unparsed and
    written to `<output_dir>/<key>.json` as `[{"explain": reply}, qa_set]`.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            'q' (question), 'a' (reference answer) and 'pred' (model answer).
            A "scoring_points" key is added to each entry that is scored.
        caption_files: File names of the form "<key>.json" to process.
        output_dir: Existing directory that receives the per-sample files.

    Returns:
        None. A sample that fails for any reason is reported on stdout and
        skipped; no output file is written for it.
    """
    q_s_dict = get_scoring_points()
    for file in tqdm(caption_files):
        print("#############",file)
        key = file[:-5] # Strip the ".json" extension
        try:
            # The lookups live inside the try block so that a missing sample
            # or a question without scoring points only skips this file
            # instead of aborting the rest of the batch.
            qa_set = prediction_set[key]
            # Newlines are removed from the question before it is used as
            # the lookup key and inserted into the prompt.
            question = qa_set['q'].replace('\n','')
            pred = qa_set['pred']
            scoring_points = q_s_dict[question]

            # Ask the model for accuracy and relevance scores.
            completion = openai.ChatCompletion.create(
                temperature=0,
                model="gpt-4-turbo",
                messages = [
                    {
                        "role": "system",
                        "content": 
                        """
                            ##TASK DESCRIPTION: 
                            You are required to evaluate a respondent's answer based on a provided question, some scoring points, and the respondent's answer. You should provide two scores. The first is the accuracy score, which should range from 1 to 5. The second is the relevance score, which should also range from 1 to 5. Below are the criteria for each scoring category.
                            ##ACCURACY Scoring Criteria: 
                            Evaluate the respondent's answer against specific scoring points as follows:
                            Score 1: The response completely misses the scoring point.
                            Score 3: The response mentions content related to the scoring point but is not entirely correct.
                            Score 5: The response accurately addresses the scoring point.
                            Calculate the average score across all scoring points to determine the final accuracy score.
                            ##RELEVANCE Scoring Criteria:
                            Assess how the respondent's answer relates to the original question:
                            Score 1: The response is completely off-topic from the question.
                            Score 2: The response is partially related to the question but contains a significant amount of irrelevant content.
                            Score 3: The response primarily addresses the question, but the respondent seems uncertain about their own answer.
                            Score 4: The response mostly addresses the question and the respondent appears confident in their answer.
                            Score 5: The response is fully focused on addressing the question with no irrelevant content and demonstrates complete certainty.
                            ----
                            ##INSTRUCTION:
                            1. Evaluate Accuracy: First, assess and score each scoring point based on the respondent's answer. Calculate the average of these scores to establish the final accuracy score. Provide a detailed rationale before assigning your score.
                            2. Evaluate RELEVANCE: Assess the relevance of the respondent’s answer to the question. Note that when evaluating relevance, the correctness of the answer is not considered; focus solely on how relevant the answer is to the question. Provide a comprehensive rationale before assigning your score.
                            3. Output Scores in JSON Format: Present the scores in JSON format as follows:
                            {'score_accuracy': score_acc, 'score_relevance': score_rele, 'total_score': score_acc + score_rele}
                        """
                    },
                    {
                        "role": "user",
                        "content": f"""
                            Please score the respondent's answer according to the steps in the Instructions. You must end with a JSON dict to store the scores.
                            Question: {question}
                            Scoring Points: {scoring_points}
                            Respondent's Answer: {pred}
                        """
                    }
                ]

            )
            # The reply (rationale followed by the JSON scores) is kept as
            # free text; it is not parsed here.
            response_message = completion["choices"][0]["message"]["content"]

            qa_set["scoring_points"] = scoring_points
            result_qa_pair = [{"explain": response_message}, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.

    Reads the predictions from `--pred_path`, gives every sample a unique key
    ("<video_name>_<running index>"), scores all samples through `annotate`
    into `--output_dir`, and finally merges every ".json" file found in that
    directory into `--output_json`.

    NOTE: the scoring loop repeats until an output file exists for every
    sample and has no retry limit, so a sample that fails permanently keeps
    the loop running.
    """
    # Parse arguments.
    args = parse_args()

    # Read the predictions and release the file handle right away.
    with open(args.pred_path) as pred_file:
        pred_contents = json.load(pred_file)

    # Make every video_name unique by appending how many times the same video
    # was seen before ("<video>_0", "<video>_1", ...). Samples are renamed in
    # place.
    video_id_counts = {}
    for sample in pred_contents:
        video_id = sample['video_name']
        occurrence = video_id_counts.get(video_id, -1) + 1
        video_id_counts[video_id] = occurrence
        sample['video_name'] = f"{video_id}_{occurrence}"

    # Expected output file name for every sample.
    caption_files = [f"{sample['video_name']}.json" for sample in pred_contents]

    output_dir = args.output_dir

    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "pred": sample['pred']}
        for sample in pred_contents
    }

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A split count below 1 would make the chunking divide by zero or yield
    # no chunks at all, and the retry loop below would then never terminate.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been written (set for O(1) membership).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # The pool is created with a single worker process, so the parts
            # are handled one after another.
            with Pool(processes=1) as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            # Report the failure and retry with whatever is still missing.
            print(f"Error: {e}")


    # Combine all the processed files into one, keyed by file name without
    # the ".json" extension.
    combined_contents = {}
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(output_dir, file_name), "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(args.output_json, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

 
    # # Calculate average score and accuracy
    # score_sum = 0
    # count = 0
  
    # for key, result in combined_contents.items():
    #     # Computing score
    #     if "explain" in key:
    #         continue
    #     count += 1
    #     try :
    #         score_match = result[0]['score']
    #         score = int(score_match)
    #         score_sum += score
    #     except:
    #         print("Score not found for", key)
    #         continue


    # average_score = score_sum / count
    # print("Average score:", average_score)


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()