feat: add ceval test script

515b9245 · PanZezhong · ddea3d19 · 515b9245
Commit 515b9245 authored Oct 10, 2025 by PanZezhong
Hide whitespace changes
Inline Side-by-side

Showing with 158 additions and 0 deletions

scripts/test_ceval.py scripts/test_ceval.py +158 -0

No files found.
--- a/scripts/test_ceval.py
+++ b/scripts/test_ceval.py
+import sys
+from jiuge import *
+from datasets import load_dataset
+
+
+class JiugeForCeval(JiugeForCauslLM):
+    def __init__(
+        self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None
+    ):
+        super().__init__(model_dir_path, device, ndev, max_tokens)
+        pass
+
+    def generate(self, conversation, max_steps, topp_=1.0, topk_=1, temperature_=1.0):
+        input_content = (
+            self.tokenizer.apply_chat_template(
+                conversation=conversation,
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            + "正确答案是"
+        )
+
+        print(input_content, end="", flush=True)
+
+        tokens = self.tokenizer.encode(input_content)
+        infer_task = InferTask(
+            0,
+            tokens,
+            self.max_context_len(),
+            temperature_,
+            topk_,
+            topp_,
+            self.eos_token_id,
+        )
+        infer_task.bind_kvcache(KVCache(self))
+
+        steps = 0
+        total_time = 0
+        output_content = ""
+
+        for step_i in range(max_steps):
+            start_time = time.time()
+            output_tokens = self.batch_infer_one_round([infer_task])
+            end_time = time.time()
+            steps += 1
+            output_str = (
+                self.tokenizer._tokenizer.id_to_token(output_tokens[0])
+                .replace("▁", " ")
+                .replace("<0x0A>", "\n")
+            )
+            output_content += output_str
+            print(output_str, end="", flush=True)
+            if output_tokens[0] in self.eos_token_id:
+                break
+            infer_task.next(output_tokens[0])
+
+            if step_i > 0:
+                total_time += end_time - start_time
+
+        print("\n")
+        avg_time = total_time * 1000 / (steps - 1 + 1e-9)
+        print(f"Time per step: {avg_time:.3f}ms")
+
+        infer_task._kv_cache.drop(self)
+        return output_content, avg_time
+
+
+def test():
+    if len(sys.argv) < 3:
+        print(
+            "Usage: python test_ceval.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]"
+        )
+        sys.exit(1)
+
+    model_path = sys.argv[2]
+    device_type = DeviceType.DEVICE_TYPE_CPU
+    if sys.argv[1] == "--cpu":
+        device_type = DeviceType.DEVICE_TYPE_CPU
+    elif sys.argv[1] == "--nvidia":
+        device_type = DeviceType.DEVICE_TYPE_NVIDIA
+    elif sys.argv[1] == "--cambricon":
+        device_type = DeviceType.DEVICE_TYPE_CAMBRICON
+    elif sys.argv[1] == "--ascend":
+        device_type = DeviceType.DEVICE_TYPE_ASCEND
+    elif sys.argv[1] == "--metax":
+        device_type = DeviceType.DEVICE_TYPE_METAX
+    elif sys.argv[1] == "--moore":
+        device_type = DeviceType.DEVICE_TYPE_MOORE
+    elif sys.argv[1] == "--iluvatar":
+        device_type = DeviceType.DEVICE_TYPE_ILUVATAR
+    elif sys.argv[1] == "--kunlun":
+        device_type = DeviceType.DEVICE_TYPE_KUNLUN
+    elif sys.argv[1] == "--hygon":
+        device_type = DeviceType.DEVICE_TYPE_HYGON
+    else:
+        print(
+            "Usage: python test_ceval.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]"
+        )
+        sys.exit(1)
+
+    # https://huggingface.co/datasets/ceval/ceval-exam/tree/main/middle_school_geography
+
+    dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_mathematics")
+    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_history")
+    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_chinese")
+    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_physics")
+    # dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_geography")
+    # dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_physics")
+
+    samples = dataset["val"]
+    ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1
+    model = JiugeForCeval(model_path, device_type, ndev)
+
+    answers_list = []
+    for sample in samples:
+        input_content = f"'question':{sample['question']},'A': {sample['A']}, 'B':{sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
+        conversation = [
+            {
+                "role": "system",
+                "content": "请从question的A，B，C，D四个选项中选择正确的选项。例如，标准答案：A。",
+            },
+            {"role": "user", "content": input_content},
+        ]
+
+        answer = sample["answer"]
+        output_content, avg_time = model.generate(
+            conversation, 500, topp_=1.0, topk_=1, temperature_=1.0
+        )
+        print("标准答案：", answer)
+        answers_list.append(
+            {"id": sample["id"], "output_content": output_content, "answer": answer}
+        )
+
+    model.destroy_model_instance()
+
+    print("-------------------------------------------------------------")
+
+    true_num = 0
+    all_num = 0
+    for cont in answers_list:
+        id = cont["id"]
+        output = cont["output_content"]
+        answer = cont["answer"]
+
+        all_num = all_num + 1
+        position = 0
+        ABCD = output[position : position + 2]
+        if answer in ABCD:
+            true_num = true_num + 1
+            print(f"id {id} : ", "正确")
+        else:
+            print(f"id {id}: ", "错误")
+
+    print(f"成绩: {true_num}/{all_num}", true_num / all_num)
+
+
+if __name__ == "__main__":
+    test()