jerrrrry / infinilm · Commits · 9c256a17

Commit 9c256a17 (unverified), authored Dec 08, 2025 by Ceng, committed by GitHub on Dec 08, 2025

issue/106 Adapt model 9G7B

parent 39bea30a

Showing 4 changed files with 678 additions and 118 deletions (+678 −118)
python/infinilm/models/llama/configuration_llama.py    +1    -1
python/infinilm/models/llama/modeling_llama.py          +1    -1
test/bench/test_benchmark.py                            +504  -0
test/models/llama/test_forward_validation.py            +172  -116
python/infinilm/models/llama/configuration_llama.py

@@ -173,7 +173,7 @@ class LlamaConfig(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        attention_bias=False,
+        attention_bias=True,
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
python/infinilm/models/llama/modeling_llama.py

@@ -157,7 +157,7 @@ class LlamaAttention(infinicore.nn.Module):
         self.o_proj = infinicore.nn.Linear(
             self.num_attention_heads * self.head_dim,
             self.hidden_size,
-            bias=attention_bias,
+            bias=False,
             **kwargs,
         )
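Taken together, these two one-line edits make attention_bias default to True while o_proj is always built without a bias term; the remaining attention projections (q/k/v, assuming they follow the usual Llama pattern of bias=attention_bias, which is not shown in this diff) therefore gain bias weights, presumably to match the 9G7B checkpoint this commit adapts. A minimal illustrative sketch of the new default (not part of the commit; the import path is inferred from the changed file's location):

# Illustrative sketch only, not part of this commit.
# Import path inferred from python/infinilm/models/llama/configuration_llama.py.
from infinilm.models.llama.configuration_llama import LlamaConfig

cfg = LlamaConfig()
print(cfg.attention_bias)                 # True  (new default introduced by this commit)

cfg = LlamaConfig(attention_bias=False)   # the previous behaviour can still be requested per model
print(cfg.attention_bias)                 # False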
test/bench/test_benchmark.py (new file, mode 100644)
import sys
import os
import argparse
import time
import re

from datasets import load_dataset

import infinicore
import infinilm
from infinilm.models.llama import AutoLlamaModel
from infinilm.modeling_utils import get_model_state_dict
from infinilm.distributed import DistConfig
from abc import ABC, abstractmethod


class BaseBenchmark(ABC):
    """Base class for benchmark evaluation with common tokenizer and generation utilities"""

    def encode_text(self, text):
        """Encode text to token IDs - reused across backends"""
        return self.tokenizer.encode(text)

    def decode_token(self, token_id):
        """Decode token ID to text - reused across backends"""
        return self.tokenizer.decode(token_id)

    @abstractmethod
    def render_input_content(self, *args, **kwargs):
        """Render input content - benchmark-specific implementation"""
        pass

    @abstractmethod
    def generate(self, *args, **kwargs):
        """Generate response - benchmark-specific implementation"""
        pass

    @abstractmethod
    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
        """Backend-specific generation implementation"""
        pass
class InfiniLMBenchmark(BaseBenchmark):
    """Wrapper class for InfiniLM cpp backend for benchmark evaluation"""

    def __init__(self, model_dir_path, device_type_str="cpu", ndev=1, backend="cpp", benchmark="ceval"):
        import transformers

        self.benchmark = benchmark

        # Map device type string to infinicore device
        device_map = {
            "cpu": "cpu",
            "nvidia": "cuda",
            "cambricon": "cambricon",
            "ascend": "ascend",
            "metax": "metax",
            "moore": "moore",
            "iluvatar": "iluvatar",
            "kunlun": "kunlun",
            "hygon": "hygon",
        }
        device_name = device_map.get(device_type_str.lower(), "cpu")

        # CUDA_VISIBLE_DEVICES is automatically respected by CUDA runtime API
        # When CUDA_VISIBLE_DEVICES=5 is set, CUDA only sees device 5 as device 0
        # So device index 0 will automatically map to the first visible device
        self.device = infinicore.device(device_name, 0)
        self.dtype = infinicore.bfloat16

        # Load config and tokenizer
        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
            import json

            self.config_dict = json.load(f)

        # Align tokenizer initialization with jiuge backend (010)
        # Match the exact same initialization logic based on model type
        model_type = self.config_dict.get("model_type", "")
        if model_type == "llama":
            # For llama models: no trust_remote_code (matches jiuge line 465)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        elif model_type in ["fm9g", "minicpm", "fm9g7b"]:
            # For fm9g/minicpm/fm9g7b models: use trust_remote_code=True (matches jiuge lines 493-495, 518-520)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path, trust_remote_code=True)
        elif model_type in ["qwen2", "qwen3"]:
            # For qwen2/qwen3 models: no trust_remote_code (matches jiuge lines 534-536)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        else:
            # Default: use trust_remote_code=True for other models
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path, trust_remote_code=True)

        eos_token_id = self.config_dict.get("eos_token_id")
        self.eos_token_id = (
            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
        )

        # Create model with cpp backend
        print("Loading model with cpp backend...")
        self.model = AutoLlamaModel.from_pretrained(
            model_dir_path,
            device=self.device,
            dtype=self.dtype,
            backend=backend,
            distributed_config=DistConfig(ndev),
        )
        # Enable KV cache for generation
        self.model.use_cache = True

        # Load weights
        print("Loading model weights...")
        model_param_infini = get_model_state_dict(
            model_dir_path,
            device=self.device,
            dtype=self.dtype,
        )
        self.model.load_state_dict(model_param_infini)
        print("Model loaded successfully")

    def max_context_len(self):
        return self.config_dict.get("max_position_embeddings", 2048)

    def render_input_content(self, *args, **kwargs):
        """Render input content based on benchmark type"""
        if self.benchmark == "ceval":
            return self._render_ceval(*args, **kwargs)
        elif self.benchmark == "mmlu":
            return self._render_mmlu(*args, **kwargs)
        else:
            raise ValueError(f"Unknown benchmark: {self.benchmark}")

    def _render_ceval(self, conversation):
        """Render C-Eval conversation to input content"""
        return (
            self.tokenizer.apply_chat_template(
                conversation=conversation,
                add_generation_prompt=True,
                tokenize=False,
            )
            + "正确答案是"
        )

    def _render_mmlu(self, question, choices):
        """Render MMLU question and choices to input content"""
        choices_text = "\n".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])
        instruction = (
            "You are a multiple-choice question solver. "
            "Select the correct option and respond with only the letter A, B, C, or D."
        )
        prompt = f"{instruction}\n\nQuestion: {question}\n{choices_text}\nAnswer:"
        # Use chat template if available, otherwise return plain text
        if hasattr(self.tokenizer, 'apply_chat_template'):
            conversation = [
                {"role": "system", "content": instruction},
                {"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"},
            ]
            try:
                return self.tokenizer.apply_chat_template(
                    conversation=conversation,
                    add_generation_prompt=True,
                    tokenize=False,
                )
            except Exception:
                return prompt
        return prompt

    def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
        """Generate response based on benchmark type"""
        # Render input content
        input_content = self.render_input_content(*args)
        print(input_content, end="", flush=True)

        # Encode input
        tokens = self.encode_text(input_content)

        # Delegate to backend-specific generation implementation
        output_content, avg_time = self._generate_step(tokens, max_steps, topp_, topk_, temperature_)
        return output_content, avg_time

    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
        """
        InfiniLM cpp backend-specific generation implementation

        NOTE: Validation confirmed input configs are identical between backends.
        The issue was that manual generation loop called InferEngine.generate() which
        doesn't maintain KV cache. Solution: Use model's built-in generate() method
        which properly handles KV cache through GenerationMixin.
        """
        # Convert tokens to infinicore format
        input_ids_list = [tokens]
        input_ids = infinicore.from_list(input_ids_list, dtype=infinicore.int64).to(self.device)

        # Use model's built-in generate() method which properly handles KV cache
        # Pass sampling parameters (temperature, topk, topp) via kwargs
        output_tokens_list, output_content = self.model.generate(
            input_ids=input_ids,
            max_new_tokens=max_steps,
            tokenizer=self.tokenizer,
            stop_on_eos=True,
            temperature=temperature_,
            topk=topk_,
            topp=topp_,
        )

        # Calculate average time (GenerationMixin doesn't return timing info)
        # We'll use a placeholder since the timing info isn't available
        print("\n")
        avg_time = 0.0  # GenerationMixin doesn't expose per-step timing
        print(f"Time per step: N/A (using GenerationMixin.generate)")
        return output_content, avg_time

    def destroy_model_instance(self):
        # Cleanup if needed
        del self.model
        print("Model destroyed")
def extract_answer_ceval(output_content, answer):
    """Extract predicted answer from C-Eval output"""
    output_upper = output_content.upper().strip()
    position = 0
    ABCD = output_upper[position:position + 2]
    return answer in ABCD


def extract_answer_mmlu(output_content):
    """Extract predicted answer from MMLU output (returns 0-3 index or None)"""
    output_upper = output_content.upper().strip()
    # Find first meaningful token
    match = re.search(r"\b([ABCD])\b", output_upper)
    if match:
        return ord(match.group(1)) - ord('A')
    else:
        match_num = re.search(r"\b([0-3])\b", output_upper)
        if match_num:
            return int(match_num.group(1))
    return None
def test():
    # Parse arguments manually to handle device flags properly
    if len(sys.argv) < 4:
        print(
            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N]"
        )
        sys.exit(1)

    # Parse device flag (first argument)
    device_flag = sys.argv[1]
    model_path = sys.argv[2]

    # Parse optional arguments
    backend = "cpp"
    ndev = 1
    benchmark = None
    subject = None  # For MMLU
    dataset_name = "middle_school_mathematics"  # For C-Eval
    num_samples = None
    max_new_tokens = 500

    i = 3
    while i < len(sys.argv):
        if sys.argv[i] == "--bench" and i + 1 < len(sys.argv):
            benchmark = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--backend" and i + 1 < len(sys.argv):
            backend = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--ndev" and i + 1 < len(sys.argv):
            ndev = int(sys.argv[i + 1])
            i += 2
        elif sys.argv[i] == "--subject" and i + 1 < len(sys.argv):
            subject = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--dataset" and i + 1 < len(sys.argv):
            dataset_name = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--num_samples" and i + 1 < len(sys.argv):
            num_samples = int(sys.argv[i + 1])
            i += 2
        elif sys.argv[i] == "--max_new_tokens" and i + 1 < len(sys.argv):
            max_new_tokens = int(sys.argv[i + 1])
            i += 2
        else:
            i += 1

    if benchmark is None:
        print("Error: --bench argument is required. Choose 'ceval' or 'mmlu'")
        sys.exit(1)
    if benchmark not in ["ceval", "mmlu"]:
        print(f"Error: Unknown benchmark '{benchmark}'. Choose 'ceval' or 'mmlu'")
        sys.exit(1)

    # Parse device type
    device_type_str = "cpu"
    if device_flag == "--cpu":
        device_type_str = "cpu"
    elif device_flag == "--nvidia":
        device_type_str = "nvidia"
    elif device_flag == "--cambricon":
        device_type_str = "cambricon"
    elif device_flag == "--ascend":
        device_type_str = "ascend"
    elif device_flag == "--metax":
        device_type_str = "metax"
    elif device_flag == "--moore":
        device_type_str = "moore"
    elif device_flag == "--iluvatar":
        device_type_str = "iluvatar"
    elif device_flag == "--kunlun":
        device_type_str = "kunlun"
    elif device_flag == "--hygon":
        device_type_str = "hygon"
    else:
        print(
            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N]"
        )
        sys.exit(1)

    # Load dataset based on benchmark
    if benchmark == "ceval":
        # Load C-Eval dataset
        # https://huggingface.co/datasets/ceval/ceval-exam/tree/main/middle_school_geography
        print(f"Loading C-Eval dataset (dataset: {dataset_name})...")
        try:
            dataset = load_dataset(r"ceval/ceval-exam", name=dataset_name)
            samples = dataset["val"]
            # Convert Dataset to list if needed
            if hasattr(samples, 'to_list'):
                samples = samples.to_list()
            else:
                samples = list(samples)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            print("Available datasets: middle_school_mathematics, high_school_history, high_school_chinese, high_school_physics, middle_school_geography, middle_school_physics")
            sys.exit(1)
    elif benchmark == "mmlu":
        # Load MMLU dataset
        # https://huggingface.co/datasets/cais/mmlu
        if subject is None:
            subject = "all"
        print(f"Loading MMLU dataset (subject: {subject})...")
        try:
            if subject == "all":
                dataset = load_dataset("cais/mmlu", "all")
                # Combine all subjects into a single dataset
                samples = []
                for subject_name in dataset.keys():
                    if subject_name in ["train", "validation", "test"]:
                        continue
                    # Convert Dataset to list
                    test_data = dataset[subject_name]["test"]
                    if hasattr(test_data, 'to_list'):
                        samples.extend(test_data.to_list())
                    else:
                        samples.extend(list(test_data))
            else:
                dataset = load_dataset("cais/mmlu", subject)
                test_data = dataset["test"]
                # Convert Dataset to list
                if hasattr(test_data, 'to_list'):
                    samples = test_data.to_list()
                else:
                    samples = list(test_data)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            print("Available subjects: abstract_algebra, anatomy, astronomy, business_ethics, etc.")
            print("Use --subject all to load all subjects")
            sys.exit(1)

    print(f"Loaded {len(samples)} samples")

    # Limit number of samples if specified
    if num_samples is not None and num_samples > 0:
        original_count = len(samples)
        samples = samples[:num_samples]
        print(f"Limited to {len(samples)} samples for validation (from {original_count} total)")

    # Create model based on backend
    if backend != "010":
        model = InfiniLMBenchmark(model_path, device_type_str, ndev, backend, benchmark)
    else:
        print(f"test 010 backend by scripts/test_ceval.py")
        exit(0)

    # Test with first sample if available
    if len(samples) > 0:
        sample = samples[0]
        if benchmark == "ceval":
            input_content = f"'question': {sample['question']},'A': {sample['A']}, 'B': {sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
            test_conversation = [
                {
                    "role": "system",
                    "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
                },
                {"role": "user", "content": input_content},
            ]
            test_output, _ = model.generate(
                test_conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
        elif benchmark == "mmlu":
            question = sample['question']
            choices = sample['choices']
            test_output, _ = model.generate(
                question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
        print(f"\nTest output: {test_output}")

    answers_list = []
    for idx, sample in enumerate(samples):
        if benchmark == "ceval":
            input_content = f"'question': {sample['question']},'A': {sample['A']}, 'B': {sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
            conversation = [
                {
                    "role": "system",
                    "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
                },
                {"role": "user", "content": input_content},
            ]
            answer = sample["answer"]
            output_content, avg_time = model.generate(
                conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
            is_correct = extract_answer_ceval(output_content, answer)
            answers_list.append({
                "id": sample.get("id", idx),
                "output_content": output_content,
                "answer": answer,
                "is_correct": is_correct,
            })
            if benchmark == "ceval":
                print("标准答案:", answer)
        elif benchmark == "mmlu":
            question = sample['question']
            choices = sample['choices']
            answer_idx = sample['answer']  # MMLU answer is 0-3 index
            output_content, avg_time = model.generate(
                question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
            predicted_answer = extract_answer_mmlu(output_content)
            # Convert answer index to letter for display
            answer_letter = chr(65 + answer_idx) if answer_idx < 4 else "?"
            predicted_letter = chr(65 + predicted_answer) if predicted_answer is not None and predicted_answer < 4 else "?"
            print(f"Sample {idx}: Correct answer: {answer_letter} ({answer_idx}), Predicted: {predicted_letter} ({predicted_answer})")
            answers_list.append({
                "id": idx,
                "output_content": output_content,
                "answer": answer_idx,
                "predicted": predicted_answer,
            })

    model.destroy_model_instance()
    print("-------------------------------------------------------------")

    # Evaluate results
    true_num = 0
    all_num = 0
    for cont in answers_list:
        id = cont["id"]
        all_num = all_num + 1
        if benchmark == "ceval":
            answer = cont["answer"]
            is_correct = cont["is_correct"]
            if is_correct:
                true_num = true_num + 1
                print(f"id {id}: ", "正确")
            else:
                print(f"id {id}: ", "错误")
        elif benchmark == "mmlu":
            answer = cont["answer"]
            predicted = cont["predicted"]
            if predicted is not None and predicted == answer:
                true_num = true_num + 1
                print(f"id {id}: Correct")
            else:
                answer_letter = chr(65 + answer) if answer < 4 else "?"
                predicted_letter = chr(65 + predicted) if predicted is not None and predicted < 4 else "?"
                print(f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})")

    accuracy = true_num / all_num if all_num > 0 else 0.0
    if benchmark == "ceval":
        print(f"成绩: {true_num}/{all_num}", accuracy)
    else:
        print(f"Accuracy: {true_num}/{all_num} = {accuracy:.2%}")


if __name__ == "__main__":
    test()
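For reference, two example invocations that follow the usage string printed by the script above; the model directory path and sample count below are placeholders, not values taken from the commit:

python test/bench/test_benchmark.py --nvidia /path/to/model_dir --bench ceval --dataset middle_school_mathematics --num_samples 10
python test/bench/test_benchmark.py --cpu /path/to/model_dir --bench mmlu --subject all --num_samples 10 --max_new_tokens 32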
test/models/llama/test_forward_validation.py
@@ -4,7 +4,6 @@ Test script to validate forward pass across different backends and dtypes.
 Tests:
 1. Python backend with bfloat16
-2. C++ backend with float32
 3. C++ backend with bfloat16

 This script runs a prefill step (full sequence forward pass with KV cache)
@@ -81,6 +80,12 @@ def get_args():
         default="How are you",
         help="Test prompt (default: 'How are you')",
     )
+    parser.add_argument(
+        "--num_decode_steps",
+        type=int,
+        default=2,
+        help="Number of decode steps to run after prefill (default: 2)",
+    )
     return parser.parse_args()
@@ -116,9 +121,9 @@ def create_inputs(prompt, tokenizer, device, backend="cpp"):
     return input_ids_infini, position_ids_infini, input_content

-def run_forward_pass(model, input_ids, position_ids, backend, dtype):
-    """Run prefill and first decode step with KV cache, return decode step logits."""
-    print(f"  Running forward pass (prefill + first decode step)...")
+def run_forward_pass(model, input_ids, position_ids, backend, dtype, num_decode_steps=2):
+    """Run prefill and multiple decode steps with KV cache, return all decode step logits."""
+    print(f"  Running forward pass (prefill + {num_decode_steps} decode step(s))...")
     try:
         # Get the underlying model
@@ -162,19 +167,6 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
             print(f"  Prefill logits stats: min={prefill_logits_np.min():.6f}, max={prefill_logits_np.max():.6f}, mean={prefill_logits_np.mean():.6f}")

-            # Step 2: Decode - run forward pass with single token
-            # Get the predicted token from prefill
-            if np.isnan(prefill_logits_np).any():
-                # If prefill has NaN, use a default token to continue testing decode step
-                print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
-                predicted_token_id = 29902
-            else:
-                predicted_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
-
-            print(f"  Step 2: Decode (next_token_id={predicted_token_id})...")
-
             # Get device from input_ids
             if hasattr(input_ids, "device"):
                 input_device = input_ids.device
@@ -182,19 +174,59 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
                 input_device = getattr(position_ids, "device", infinicore.device("cpu", 0))

-            # Create single token input for decode step
-            decode_input_ids = infinicore.from_list([[predicted_token_id]], device=input_device)
-            # Create position_ids for decode step (should be seq_len, since we've processed seq_len tokens)
+            # Initialize decode logits list
+            decode_logits_list = []
             seq_len = input_ids.shape[1]
-            decode_position_ids = infinicore.from_list([[seq_len]], dtype=infinicore.int64, device=input_device)
-            # Run decode step - C++ backend manages cache internally
-            decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids)
+            current_token_id = None
+
+            # Run multiple decode steps
+            for decode_step in range(num_decode_steps):
+                # Get the predicted token from previous step
+                if decode_step == 0:
+                    # First decode step: use token from prefill
+                    if np.isnan(prefill_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
+                else:
+                    # Subsequent decode steps: use token from previous decode
+                    prev_logits_np = decode_logits_list[-1]
+                    if np.isnan(prev_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in decode step {decode_step} logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prev_logits_np.argmax(axis=-1)[0, 0])
+
+                print(f"  Step {decode_step + 2}: Decode step {decode_step + 1} (next_token_id={current_token_id})...")
+
+                # Create single token input for decode step
+                decode_input_ids = infinicore.from_list([[current_token_id]], device=input_device)
+                # Create position_ids for decode step
+                decode_position_ids = infinicore.from_list([[seq_len + decode_step]], dtype=infinicore.int64, device=input_device)
+                # Run decode step - C++ backend manages cache internally
+                decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids)
+
+                # Convert decode logits to numpy
+                decode_logits_np = infinicore_to_numpy(decode_logits)
+                decode_logits_list.append(decode_logits_np)
+                print(f"  ✓ Decode step {decode_step + 1} completed, logits shape: {decode_logits_np.shape}")
+
+                # Check decode logits for issues
+                if np.isnan(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain NaN values!")
+                    print(f"    NaN count: {np.isnan(decode_logits_np).sum()}")
+                if np.isinf(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain Inf values!")
+                    print(f"    Inf count: {np.isinf(decode_logits_np).sum()}")
+                if not np.isnan(decode_logits_np).any():
+                    print(f"  Decode step {decode_step + 1} logits stats: min={decode_logits_np.min():.6f}, max={decode_logits_np.max():.6f}, mean={decode_logits_np.mean():.6f}")
         else:
             # Python backend uses DynamicCache
             # Get model config
@@ -217,12 +249,6 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
             print(f"  ✓ Prefill completed, logits shape: {prefill_logits_np.shape}")

-            # Step 2: Decode - run forward pass with single token
-            # Get the predicted token from prefill
-            predicted_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
-            print(f"  Step 2: Decode (next_token_id={predicted_token_id})...")
-
             # Get device from input_ids
             if hasattr(input_ids, "device"):
                 input_device = input_ids.device
@@ -231,48 +257,87 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
                 input_device = getattr(position_ids, "device", infinicore.device("cpu", 0))

-            # Create single token input for decode step
-            decode_input_ids = infinicore.from_list([[predicted_token_id]], device=input_device)
-            # Create position_ids for decode step (should be seq_len, since we've processed seq_len tokens)
+            # Initialize decode logits list
+            decode_logits_list = []
             seq_len = input_ids.shape[1]
-            decode_position_ids = infinicore.from_list([[seq_len]], dtype=infinicore.int64, device=input_device)
-
-            # Run decode step with KV cache
-            decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids, past_key_values=past_key_values, use_cache=True)
-
-            # Convert decode logits to numpy for analysis
-            logits_np = infinicore_to_numpy(decode_logits)
-
-            print(f"  ✓ Forward pass completed (prefill + decode)")
-            print(f"  Decode logits shape: {logits_np.shape}")
-            print(f"  Decode logits dtype: {logits_np.dtype}")
-            print(f"  Decode logits stats: min={logits_np.min():.6f}, max={logits_np.max():.6f}, mean={logits_np.mean():.6f}")
-
-            # Check for issues
-            if np.isnan(logits_np).any():
-                print(f"  ⚠ WARNING: Logits contain NaN values!")
-                return None, True
-            if np.isinf(logits_np).any():
-                print(f"  ⚠ WARNING: Logits contain Inf values!")
-                return None, True
-
-            # Check if logits are too small (might indicate model not working)
-            if np.abs(logits_np).max() < 1.0:
-                print(f"  ⚠ WARNING: Logits are very small (max abs: {np.abs(logits_np).max():.6f})")
-
-            # Get predicted token from decode step
-            predicted_token = int(logits_np.argmax(axis=-1)[0, 0])
-            print(f"  Predicted token ID from decode: {predicted_token}")
-
-            return logits_np, False
+            current_token_id = None
+
+            # Run multiple decode steps
+            for decode_step in range(num_decode_steps):
+                # Get the predicted token from previous step
+                if decode_step == 0:
+                    # First decode step: use token from prefill
+                    if np.isnan(prefill_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
+                else:
+                    # Subsequent decode steps: use token from previous decode
+                    prev_logits_np = decode_logits_list[-1]
+                    if np.isnan(prev_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in decode step {decode_step} logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prev_logits_np.argmax(axis=-1)[0, 0])
+
+                print(f"  Step {decode_step + 2}: Decode step {decode_step + 1} (next_token_id={current_token_id})...")
+
+                # Create single token input for decode step
+                decode_input_ids = infinicore.from_list([[current_token_id]], device=input_device)
+                # Create position_ids for decode step
+                decode_position_ids = infinicore.from_list([[seq_len + decode_step]], dtype=infinicore.int64, device=input_device)
+                # Run decode step with KV cache
+                decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids, past_key_values=past_key_values, use_cache=True)
+
+                # Convert decode logits to numpy
+                decode_logits_np = infinicore_to_numpy(decode_logits)
+                decode_logits_list.append(decode_logits_np)
+                print(f"  ✓ Decode step {decode_step + 1} completed, logits shape: {decode_logits_np.shape}")
+
+                # Check decode logits for issues
+                if np.isnan(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain NaN values!")
+                    print(f"    NaN count: {np.isnan(decode_logits_np).sum()}")
+                if np.isinf(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain Inf values!")
+                    print(f"    Inf count: {np.isinf(decode_logits_np).sum()}")
+                if not np.isnan(decode_logits_np).any():
+                    print(f"  Decode step {decode_step + 1} logits stats: min={decode_logits_np.min():.6f}, max={decode_logits_np.max():.6f}, mean={decode_logits_np.mean():.6f}")
+
+        # Summary of all decode steps
+        print(f"  ✓ Forward pass completed (prefill + {num_decode_steps} decode step(s))")
+        for i, logits_np in enumerate(decode_logits_list):
+            print(f"  Decode step {i + 1} logits shape: {logits_np.shape}, dtype: {logits_np.dtype}")
+
+        # Check for issues in all decode steps
+        has_error = False
+        for i, logits_np in enumerate(decode_logits_list):
+            if np.isnan(logits_np).any():
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits contain NaN values!")
+                print(f"    NaN count: {np.isnan(logits_np).sum()}")
+                has_error = True
+            if np.isinf(logits_np).any():
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits contain Inf values!")
+                print(f"    Inf count: {np.isinf(logits_np).sum()}")
+                has_error = True
+            if np.abs(logits_np).max() < 1.0:
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits are very small (max abs: {np.abs(logits_np).max():.6f})")
+
+        # Get predicted token from last decode step
+        if decode_logits_list and not np.isnan(decode_logits_list[-1]).any():
+            predicted_token = int(decode_logits_list[-1].argmax(axis=-1)[0, 0])
+            print(f"  Predicted token ID from decode step {num_decode_steps}: {predicted_token}")
+
+        # Return tuple of all decode logits
+        return tuple(decode_logits_list), has_error

     except Exception as e:
         print(f"  ✗ Forward pass failed: {e}")
@@ -353,7 +418,7 @@ def infinicore_to_numpy(tensor):
     return result

-def test_configuration(model_path, device, backend, dtype, prompt):
+def test_configuration(model_path, device, backend, dtype, prompt, num_decode_steps=2):
     """Test a specific backend/dtype configuration."""
     print("\n" + "=" * 80)
     print(f"Testing: Backend={backend}, Dtype={dtype}")
@@ -377,7 +442,7 @@ def test_configuration(model_path, device, backend, dtype, prompt):
     # Load tokenizer
     print("\n1. Loading tokenizer...")
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         print(f"  ✓ Tokenizer loaded")
     except Exception as e:
         print(f"  ✗ Failed to load tokenizer: {e}")
@@ -428,25 +493,25 @@
         traceback.print_exc()
         return None, True

-    # Run forward pass (prefill + decode step)
-    print(f"\n5. Running forward pass (prefill + first decode step)...")
-    logits, has_error = run_forward_pass(model, input_ids, position_ids, backend, dtype)
+    # Run forward pass (prefill + multiple decode steps)
+    print(f"\n5. Running forward pass (prefill + {num_decode_steps} decode step(s))...")
+    logits_tuple, has_error = run_forward_pass(model, input_ids, position_ids, backend, dtype, num_decode_steps)

     if has_error:
         return None, True

-    return logits, False
+    return logits_tuple, False


-def compare_logits(logits1, logits2, name1, name2):
+def compare_logits(logits1, logits2, name1, name2, step_name="logits"):
     """Compare two logits arrays."""
     print(f"\n{'='*80}")
-    print(f"Comparing: {name1} vs {name2}")
+    print(f"Comparing: {name1} vs {name2} ({step_name})")
     print(f"{'='*80}")

     if logits1 is None or logits2 is None:
-        print("  ✗ Cannot compare: one or both logits are None")
+        print(f"  ✗ Cannot compare: one or both {step_name} are None")
         return False

     if logits1.shape != logits2.shape:
@@ -469,9 +534,9 @@ def compare_logits(logits1, logits2, name1, name2):
     is_close = np.allclose(logits1, logits2, rtol=rtol, atol=atol)

     if is_close:
-        print(f"  ✓ Logits are close (within tolerance)")
+        print(f"  ✓ {step_name.capitalize()} are close (within tolerance)")
     else:
-        print(f"  ⚠ Logits differ significantly")
+        print(f"  ⚠ {step_name.capitalize()} differ significantly")
         # Show top differences
         flat_diff = diff.flatten()
         top_indices = np.argsort(flat_diff)[-10:][::-1]
@@ -493,6 +558,7 @@ def main():
     print(f"Model path: {args.model_path}")
     print(f"Device: {args.device}")
     print(f"Prompt: {args.prompt}")
+    print(f"Number of decode steps: {args.num_decode_steps}")
     print("=" * 80)

     results = {}
@@ -502,25 +568,16 @@
     print("TEST 1: Python Backend + BFloat16")
     print("=" * 80)
     logits_py_bf16, error = test_configuration(
-        args.model_path, args.device, "python", "bfloat16", args.prompt
+        args.model_path, args.device, "python", "bfloat16", args.prompt, args.num_decode_steps
     )
     results["python_bf16"] = (logits_py_bf16, error)

-    # Test 2: C++ backend with float32
-    print("\n\n" + "=" * 80)
-    print("TEST 2: C++ Backend + Float32")
-    print("=" * 80)
-    logits_cpp_f32, error = test_configuration(
-        args.model_path, args.device, "cpp", "float32", args.prompt
-    )
-    results["cpp_f32"] = (logits_cpp_f32, error)
-
     # Test 3: C++ backend with bfloat16
     print("\n\n" + "=" * 80)
     print("TEST 3: C++ Backend + BFloat16")
     print("=" * 80)
     logits_cpp_bf16, error = test_configuration(
-        args.model_path, args.device, "cpp", "bfloat16", args.prompt
+        args.model_path, args.device, "cpp", "bfloat16", args.prompt, args.num_decode_steps
     )
     results["cpp_bf16"] = (logits_cpp_bf16, error)
@@ -533,23 +590,22 @@
-    # Compare Python BF16 vs C++ BF16 (should be similar)
-    if not results["python_bf16"][1] and not results["cpp_bf16"][1]:
-        is_close = compare_logits(
-            results["python_bf16"][0], results["cpp_bf16"][0], "Python BF16", "C++ BF16"
-        )
-        comparisons.append(("Python BF16 vs C++ BF16", is_close))
-
-    # Compare C++ F32 vs C++ BF16 (should be similar but with some differences)
-    if not results["cpp_f32"][1] and not results["cpp_bf16"][1]:
-        is_close = compare_logits(
-            results["cpp_f32"][0], results["cpp_bf16"][0], "C++ F32", "C++ BF16"
-        )
-        comparisons.append(("C++ F32 vs C++ BF16", is_close))
+    py_logits = results["python_bf16"][0]
+    cpp_logits = results["cpp_bf16"][0]
+    if py_logits is not None and cpp_logits is not None:
+        # Compare all decode steps
+        num_steps = min(len(py_logits), len(cpp_logits))
+        for step_idx in range(num_steps):
+            step_name = f"decode step {step_idx + 1}"
+            is_close = compare_logits(
+                py_logits[step_idx], cpp_logits[step_idx], "Python BF16", "C++ BF16", step_name
+            )
+            comparisons.append((f"Python BF16 vs C++ BF16 ({step_name})", is_close))

     # Summary
     print("\n\n" + "=" * 80)
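For reference, a sketch of how the updated validation script might be invoked; --prompt and --num_decode_steps correspond to the argparse options shown in this diff, while the model-path and device flag spellings are assumptions (they are not visible in the excerpt):

python test/models/llama/test_forward_validation.py --model_path /path/to/model_dir --device cpu --prompt "How are you" --num_decode_steps 3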