# mmlu-lmcache_enabled-dynamo.py

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is an MMLU test script for dynamo LMCache testing (with LMCache enabled)
# Reference: https://github.com/LMCache/LMCache/blob/dev/.buildkite/correctness/2-mmlu.py

# ASSUMPTIONS:
# 1. dynamo is running (default: localhost:8000) with LMCache enabled
# 2. the mmlu dataset is in a "data" directory
# 3. all invocations of this script should be run in the same directory
#    (for later consolidation)

# Standard
import argparse
import json
import os
from typing import Optional

import numpy as np
import pandas as pd
import requests

# Third Party
from tqdm import tqdm
from transformers import AutoTokenizer, set_seed

# Letters used to label the answer options of a question, in column order.
choices = list("ABCD")
# Tokenizer shared by the whole script; stays None until main() loads the one
# belonging to the requested model.
tokenizer: Optional[AutoTokenizer] = None


# for complete determinism between runs of MMLU, we should:
# 1. set the seed of LLM requests to a fixed number (42)
# 2. set temperature to 0 on requests
def get_llm_response(args, prompt):
    """Send ``prompt`` to the completions endpoint and return the generated text.

    The request is pinned to temperature 0 and seed 42 so that repeated runs
    are comparable, and generation is capped at 3 tokens.

    Args:
        args: Parsed CLI namespace; ``model``, ``host`` and ``port`` are read.
        prompt: Full prompt text to complete.

    Returns:
        The ``text`` field of the first entry in the response's ``choices``.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    # Payload in dynamo's completions API format.
    payload = {
        "model": args.model,
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 3,
        "stream": False,
        "seed": 42,  # explicit seed for determinism
    }
    endpoint = f"http://{args.host}:{args.port}/v1/completions"
    reply = requests.post(endpoint, json=payload, timeout=30)
    if reply.status_code == 200:
        return reply.json()["choices"][0]["text"]
    raise Exception(f"Error: {reply.status_code} {reply.text}")


# grab the idx'th row of the df and generate a prompt string
# format of the MMLU csvs:
# question,option_A,option_B,option_C,option_D,answer
def prompt_string(df, idx, include_answer=True):
    """Render row ``idx`` of an MMLU dataframe as a prompt string.

    The frame is expected to have no header row and the column layout
    ``question, option_A, ..., option_D, answer``: the first column is the
    question, the last column is the answer letter, and every column in
    between is one option.

    Args:
        df: DataFrame holding MMLU rows in the layout described above.
        idx: Positional index of the row to render.
        include_answer: When True, append the row's answer followed by a
            blank line so the result can serve as a solved few-shot example.
            When False, the prompt ends right after ``Answer:``.

    Returns:
        The prompt text for the requested row.

    NOTE(review): if a companion (e.g. baseline) script builds prompts the
    same way, it should be kept in sync so results stay comparable.
    """
    num_options = df.shape[1] - 2  # every column except question and answer
    answer_col = df.shape[1] - 1  # the answer is always the last column

    prompt = df.iloc[idx, 0]
    for i in range(num_options):
        prompt += f"\n{choices[i]}. {df.iloc[idx, i + 1]}"
    prompt += "\nRespond with **only the letter** (A, B, C, D).  Do **not** output any explanation, analysis, or extra words. Answer:"
    if include_answer:
        # Column ``num_options`` holds the last option's text, not the
        # answer; the answer letter lives in the final column.
        prompt += f" {df.iloc[idx, answer_col]}\n\n"
    return prompt


def evaluate(args, subject, dev_df, test_df):
    """Score the model on one subject and return its accuracy.

    A shared few-shot prefix is assembled from solved ``dev_df`` rows, every
    ``test_df`` row is appended to it as an unanswered question, each prompt
    is sent through ``get_llm_response`` and the predicted letter is compared
    with the last column of ``test_df``.

    Args:
        args: Parsed CLI namespace, forwarded to ``get_llm_response``.
        subject: Human-readable subject name inserted into the prefix header.
        dev_df: DataFrame of solved examples used for the few-shot prefix.
        test_df: DataFrame of questions to score; its last column is the label.

    Returns:
        The fraction of predictions equal to their label (``np.mean`` of the
        element-wise comparison).
    """
    # Header line followed by solved dev examples; every test prompt reuses it.
    prefix_parts = [
        f"The following are multiple choice questions (with answers) \
                                about {subject}. \n\n"
    ]
    prefix_token_count = 0
    for dev_idx in range(dev_df.shape[0]):
        # few-shot examples keep their answers
        example = prompt_string(dev_df, dev_idx)
        prefix_parts.append(example)

        # plain list of token IDs, no torch tensors
        assert tokenizer is not None, "Tokenizer must be initialized"
        encoded = tokenizer(example, add_special_tokens=True)  # type: ignore
        prefix_token_count += len(encoded["input_ids"])

        # Stop adding examples once the running total passes 4000 tokens; the
        # example that crosses the limit stays in the prefix.
        if prefix_token_count > 4000:
            break

    # every part already ends with a blank line, so no separator is needed
    shared_prefix = "".join(prefix_parts)

    row_count = test_df.shape[0]
    answer_col = test_df.shape[1] - 1
    # the question being asked must NOT carry its answer
    prompts = [
        f"{shared_prefix}\n\n{prompt_string(test_df, row, include_answer=False)}"
        for row in range(row_count)
    ]
    labels = [test_df.iloc[row, answer_col] for row in range(row_count)]

    valid_letters = ("A", "B", "C", "D")
    predictions = []
    for prompt in prompts:
        reply = get_llm_response(args, prompt).strip()
        # The first valid letter anywhere in the reply wins; default to "A"
        # when the reply contains none.
        predictions.append(next((ch for ch in reply if ch in valid_letters), "A"))

    return np.mean(np.array(predictions) == np.array(labels))


def main(args):
    """Evaluate the first ``args.number_of_subjects`` MMLU subjects and save results.

    Loads the tokenizer for ``args.model`` into the module-level ``tokenizer``,
    discovers subjects from the ``*_test.csv`` files in ``data/test`` (sorted by
    name), evaluates each one against its ``data/dev`` and ``data/test`` CSVs,
    and writes one JSON object per line to ``args.result_file``: one line per
    subject plus a final ``"total"`` line. The total accuracy is the unweighted
    mean of the per-subject accuracies.

    Args:
        args: Parsed CLI namespace; ``model``, ``number_of_subjects`` and
            ``result_file`` are read here, and the namespace is passed on to
            ``evaluate``.
    """
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    subjects = sorted(
        name.split("_test.csv")[0]
        for name in os.listdir("data/test")
        if name.endswith("_test.csv")
    )
    selected = subjects[: args.number_of_subjects]

    output_dict = {}
    accuracies = []
    num_questions = []
    for subject_raw in tqdm(selected, desc="Processing subjects"):
        dev_df = pd.read_csv(
            os.path.join("data/dev", f"{subject_raw}_dev.csv"), header=None
        )
        test_df = pd.read_csv(
            os.path.join("data/test", f"{subject_raw}_test.csv"), header=None
        )
        # prompts use the subject name with spaces instead of underscores
        accuracy = evaluate(args, subject_raw.replace("_", " "), dev_df, test_df)
        question_count = len(test_df)

        accuracies.append(accuracy)
        num_questions.append(question_count)
        output_dict[subject_raw] = {
            "accuracy": accuracy,
            "num_questions": question_count,
        }

    output_dict["total"] = {
        "accuracy": np.mean(accuracies),
        "num_questions": sum(num_questions),
    }

    # JSONL output: one {name: stats} object per line
    with open(args.result_file, "w") as f:
        f.writelines(
            json.dumps({name: stats}) + "\n" for name, stats in output_dict.items()
        )


if __name__ == "__main__":
    # Seed before anything else: some tokenizers may have randomness.
    set_seed(42)

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", type=str, required=True)
    cli.add_argument("--result-file", type=str)
    cli.add_argument("--number-of-subjects", type=int, required=True)
    cli.add_argument("--host", type=str, default="localhost", help="Dynamo host")
    cli.add_argument("--port", type=int, default=8000, help="Dynamo port")
    args = cli.parse_args()

    if args.result_file is None:
        # Derive the default file name from the last "/"-separated component
        # of the model, so paths and org/name identifiers both work.
        model_name = args.model.rpartition("/")[2]
        args.result_file = f"dynamo-lmcache-{model_name}.jsonl"

    main(args)