Unverified commit 9c256a17 authored by Ceng, committed by GitHub

issue/106: adapt the 9G7B model

parent 39bea30a
......@@ -4,6 +4,7 @@
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/context/context.hpp"
#include <algorithm>
#include <memory>
#include <numeric>
......@@ -14,23 +15,23 @@
namespace infinilm::cache {
/**
* @brief Simple KV cache structure for incremental decoding
* @brief Single layer's KV cache for incremental decoding
*
* Stores key and value caches with shape [batch_size, n_kv_head, capacity, head_dim]
* Similar to DynamicLayer in Python cache_utils.py
*
* This is a common component that can be used by any model architecture
* that needs KV caching for attention mechanisms.
* This represents a single layer's cache within a model-level cache container.
*/
struct KVCache {
struct KVCacheLayer {
infinicore::Tensor k_cache; // [batch_size, n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [batch_size, n_kv_head, capacity, head_dim]
std::vector<size_t> cache_positions; // Current write position in the cache, one entry per batch item
size_t max_capacity; // Maximum capacity of cache
bool initialized; // Whether cache has been initialized
KVCache() : max_capacity(0), initialized(false) {}
KVCacheLayer() : max_capacity(0), initialized(false) {}
/**
* @brief Initialize or update cache capacity
......@@ -39,11 +40,20 @@ struct KVCache {
* @param seq_len Sequence length of new tokens
* @param dtype Data type
* @param device Device
* @param max_position_embeddings Maximum position embeddings (for initial capacity)
*/
void ensure_capacity(size_t batch_size, size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) {
infinicore::DataType dtype, const infinicore::Device &device,
size_t max_position_embeddings = 4096) {
// Required capacity = new tokens + furthest existing write position across the batch
size_t required_capacity = seq_len + std::accumulate(cache_positions.begin(), cache_positions.end(), size_t(0), [](size_t a, size_t b) { return std::max(a, b); });
// VALIDATION: Verify input parameters
if (num_kv_heads == 0 || head_dim == 0 || seq_len == 0) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Invalid parameters - num_kv_heads: {}, head_dim: {}, seq_len: {}",
num_kv_heads, head_dim, seq_len);
throw std::runtime_error("KV cache ensure_capacity: invalid parameters");
}
// Lazy initialization
if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
......@@ -53,6 +63,15 @@ struct KVCache {
dtype, device);
cache_positions = std::vector<size_t>(batch_size, 0);
initialized = true;
// VALIDATION: Verify cache was created correctly
// Shape is [batch_size, num_kv_heads, max_capacity, head_dim]
if (k_cache->shape()[0] != batch_size || k_cache->shape()[1] != num_kv_heads ||
k_cache->shape()[2] != max_capacity || k_cache->shape()[3] != head_dim) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Cache shape mismatch after initialization - expected: [{}, {}, {}, {}], got: {}",
batch_size, num_kv_heads, max_capacity, head_dim, k_cache->info());
throw std::runtime_error("KV cache initialization: shape mismatch");
}
}
// Grow cache if needed (similar to DynamicLayer in Python)
else if (required_capacity > max_capacity) {
......@@ -83,10 +102,25 @@ struct KVCache {
k_cache = k_new;
v_cache = v_new;
max_capacity = new_capacity;
// VALIDATION: Verify cache was grown correctly
// Shape is [batch_size, num_kv_heads, max_capacity, head_dim]
if (k_cache->shape()[2] != new_capacity) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: New cache capacity mismatch - expected: {}, got: {}",
new_capacity, k_cache->shape()[2]);
throw std::runtime_error("KV cache growth: capacity mismatch");
}
}
// VALIDATION: Final check that capacity is sufficient
if (required_capacity > max_capacity) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Capacity still insufficient after growth - required: {}, max_capacity: {}",
required_capacity, max_capacity);
throw std::runtime_error("KV cache ensure_capacity: capacity insufficient");
}
}
KVCache(size_t max_batch_size, size_t n_kv_head, size_t head_dim, infinicore::DataType dtype, size_t max_seqlen = 4096, infinicore::Device device = infinicore::context::getDevice())
KVCacheLayer(size_t max_batch_size, size_t n_kv_head, size_t head_dim, infinicore::DataType dtype, size_t max_seqlen = 4096, infinicore::Device device = infinicore::context::getDevice())
: max_capacity(max_seqlen), initialized(false) {
cache_positions = std::vector<size_t>(max_batch_size, 0);
ensure_capacity(max_batch_size, n_kv_head, head_dim, max_capacity, dtype, device);
......@@ -96,7 +130,7 @@ struct KVCache {
* @brief Update cache with new key and value states
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [n_kv_head, total_seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* Note: This method writes to the cache. If used together with an attention op that
* also writes to the cache, this should be called AFTER attention, not before.
......@@ -143,4 +177,120 @@ struct KVCache {
}
};
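// Hedged usage sketch (function and variable names are illustrative, not part of this
// change): a single decode step appends the new K/V states and reads back the
// accumulated cache for attention.
inline void append_one_decode_step(infinilm::cache::KVCacheLayer &layer,
                                   const infinicore::Tensor &k_step,   // [bs, n_kv_head, 1, head_dim]
                                   const infinicore::Tensor &v_step) { // [bs, n_kv_head, 1, head_dim]
    // update() grows the backing buffers via ensure_capacity() when required and
    // returns the accumulated cache with shape [bs, n_kv_head, total_seq_len, head_dim].
    auto [k_total, v_total] = layer.update(k_step, v_step);
    (void)k_total;
    (void)v_total;
}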
/**
* @brief Model-level KV cache container (similar to DynamicCache in Python)
*
* Stores a list of KVCacheLayer objects, one per model layer.
* This aligns with Python backend's DynamicCache architecture.
*/
class DynamicCache {
public:
/**
* @brief Construct DynamicCache with specified number of layers
*
* @param num_layers Number of model layers (creates one cache layer per model layer)
* @param max_position_embeddings Maximum position embeddings (used for initial capacity)
*/
DynamicCache(size_t num_layers, size_t max_position_embeddings = 4096)
: layers_(num_layers), max_position_embeddings_(max_position_embeddings) {}
/**
* @brief Update cache with new key and value states for a specific layer
*
* @param layer_idx Layer index (0-based)
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* This method updates the cache for the specified layer and returns the
* accumulated cache up to the current position.
*/
std::pair<infinicore::Tensor, infinicore::Tensor> update(
size_t layer_idx,
const infinicore::Tensor &k_new,
const infinicore::Tensor &v_new) {
if (layer_idx >= layers_.size()) {
SPDLOG_ERROR("DynamicCache::update: layer_idx {} out of range (num_layers: {})",
layer_idx, layers_.size());
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
// Update the cache for this layer
return layers_[layer_idx].update(k_new, v_new);
}
/**
* @brief Update cache with new key and value states (convenience method without layer_idx)
* This is used when the cache is accessed directly without layer information
*
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* Note: This assumes layer_idx=0. For multi-layer models, use update(layer_idx, k_new, v_new) instead.
*/
std::pair<infinicore::Tensor, infinicore::Tensor> update(
const infinicore::Tensor &k_new,
const infinicore::Tensor &v_new) {
return update(0, k_new, v_new);
}
/**
* @brief Get the number of layers in this cache
*/
size_t num_layers() const { return layers_.size(); }
/**
* @brief Get cache position for a specific layer
*/
size_t cache_position(size_t layer_idx) const {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
if (layers_[layer_idx].cache_positions.empty()) {
return 0;
}
return layers_[layer_idx].cache_positions[0]; // All batch items should have same position
}
/**
* @brief Get max position embeddings (used for initial capacity)
*/
size_t max_position_embeddings() const { return max_position_embeddings_; }
/**
* @brief Reset cache for all layers to a specific position
* This should be called when starting a new generation sequence or resetting to a specific position
* @param pos Position to reset to (defaults to 0)
*/
void reset(size_t pos = 0) {
for (auto& layer : layers_) {
std::fill(layer.cache_positions.begin(), layer.cache_positions.end(), pos);
// Note: We don't reset initialized flag or clear the cache tensors
// to avoid reallocation. The cache will be overwritten on next update.
}
}
/**
* @brief Access a specific layer's cache (for advanced usage)
*/
KVCacheLayer& layer(size_t layer_idx) {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
return layers_[layer_idx];
}
const KVCacheLayer& layer(size_t layer_idx) const {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
return layers_[layer_idx];
}
private:
std::vector<KVCacheLayer> layers_;
size_t max_position_embeddings_;
};
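// Hedged sketch of how a model forward loop is expected to drive DynamicCache
// (function and variable names are illustrative, not part of this change):
inline void run_decode_step(infinilm::cache::DynamicCache &cache,
                            const std::vector<infinicore::Tensor> &k_per_layer,
                            const std::vector<infinicore::Tensor> &v_per_layer) {
    for (size_t i = 0; i < cache.num_layers(); ++i) {
        // Each model layer updates its own KVCacheLayer and receives the
        // accumulated K/V with shape [bs, n_kv_head, total_seq_len, head_dim].
        auto [k_total, v_total] = cache.update(i, k_per_layer[i], v_per_layer[i]);
        (void)k_total;
        (void)v_total;
    }
    // Between independent prompts, cache.reset() rewinds positions to 0
    // without reallocating the underlying tensors.
}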
} // namespace infinilm::cache
#include "infer_engine.hpp"
#include "spdlog/spdlog.h"
namespace infinilm::engine {
......@@ -71,4 +72,14 @@ const distributed::DistConfig &InferEngine::get_dist_config() const {
return communication_group_.get_dist_config();
}
//------------------------------------------------------
// reset_cache
//------------------------------------------------------
void InferEngine::reset_cache(size_t pos, bool async) {
// Reset cache on all workers
for (auto &worker : workers_) {
worker->reset_cache(pos, async);
}
}
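// Hedged caller-side sketch (helper name is illustrative): rewind all worker
// caches before serving a new, unrelated prompt. Synchronous by default; the
// async path is documented as unstable.
inline void reset_between_prompts(InferEngine &engine, bool async = false) {
    engine.reset_cache(/*pos=*/0, async);
}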
} // namespace infinilm::engine
......@@ -26,6 +26,11 @@ public:
infinicore::Tensor generate(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids);
// Reset the internal cache in all workers (clears state between generations)
// By default, this is synchronous (blocks until reset completes).
// If async=true, this becomes asynchronous (unstable - use with caution).
void reset_cache(size_t pos = 0, bool async = false);
~InferEngine();
const distributed::DistConfig &get_dist_config() const;
......
......@@ -111,6 +111,35 @@ void RankWorker::wait() {
}
}
//------------------------------------------------------
// reset_cache -- synchronous by default, async optional (unstable)
//------------------------------------------------------
void RankWorker::reset_cache(size_t pos, bool async) {
{
std::lock_guard<std::mutex> lock(mutex_);
if (should_exit_) {
throw std::runtime_error("RankWorker is closing; cannot reset_cache");
}
pending_reset_pos_ = pos;
job_cmd_ = Command::RESET_CACHE;
has_job_ = true;
job_done_ = false;
}
cv_.notify_all();
// By default, wait for job completion (synchronous)
// If async=true, return immediately (unstable - use with caution)
if (!async) {
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return job_done_ || should_exit_; });
if (should_exit_) {
throw std::runtime_error("RankWorker stopped while resetting cache");
}
}
}
//------------------------------------------------------
// close -- request shutdown and join thread
//------------------------------------------------------
......@@ -160,6 +189,7 @@ void RankWorker::thread_loop() {
std::string local_param_name;
infinicore::Tensor local_param;
std::vector<std::any> local_args;
size_t local_reset_pos = 0;
// Wait for a job or exit
{
......@@ -177,6 +207,8 @@ void RankWorker::thread_loop() {
local_param = pending_param_;
} else if (local_cmd == Command::RUN) {
local_args = pending_args_;
} else if (local_cmd == Command::RESET_CACHE) {
local_reset_pos = pending_reset_pos_;
}
// mark job as being processed
......@@ -225,6 +257,25 @@ void RankWorker::thread_loop() {
spdlog::error("[{}] exception during forward: {}\n", info(), e.what());
break;
}
} else if (local_cmd == Command::RESET_CACHE) {
try {
// Generic reset_cache on the model interface
model_->reset_cache(local_reset_pos);
{
std::lock_guard<std::mutex> lk(mutex_);
job_done_ = true;
}
cv_.notify_all();
} catch (const std::exception &e) {
std::lock_guard<std::mutex> lk(mutex_);
should_exit_ = true;
job_done_ = true;
cv_.notify_all();
spdlog::error("[{}] exception during reset_cache: {}\n", info(), e.what());
break;
}
} else {
// Shouldn't reach here (no-op)
}
......
......@@ -17,6 +17,7 @@ class RankWorker {
INIT,
LOAD,
RUN,
RESET_CACHE,
STOP
};
......@@ -34,6 +35,11 @@ public:
// Submit a run (forward) job.
void run(const std::vector<std::any> &args);
// Reset the internal cache in the model (clears state between generations)
// By default, this is synchronous (blocks until reset completes).
// If async=true, this becomes asynchronous (unstable - use with caution).
void reset_cache(size_t pos = 0, bool async = false);
// Wait until run job completes. The result can be retrieved with get_output().
void wait();
......@@ -67,6 +73,7 @@ private:
std::string pending_param_name_;
infinicore::Tensor pending_param_;
std::vector<std::any> pending_args_;
size_t pending_reset_pos_ = 0;
// Output (protected by mutex)
infinicore::Tensor output_;
......
......@@ -28,34 +28,100 @@ inline void log_tensor_stats(const infinicore::Tensor &tensor, const std::string
SPDLOG_INFO(" {}: shape={}, dtype={}, device={}", name, shape_str, static_cast<int>(dtype), device.toString());
// For F32 tensors, compute and log statistics
if (dtype == infinicore::DataType::F32) {
// For F32, F16, and BF16 tensors, compute and log statistics
if (dtype == infinicore::DataType::F32 ||
dtype == infinicore::DataType::F16 ||
dtype == infinicore::DataType::BF16) {
// Copy to CPU if needed and compute stats
auto cpu_tensor = tensor->to(infinicore::Device(infinicore::Device::Type::CPU, 0));
std::byte *raw_data = cpu_tensor->data();
float *data = reinterpret_cast<float*>(raw_data);
size_t numel = cpu_tensor->numel();
if (numel > 0) {
float min_val = *std::min_element(data, data + numel);
float max_val = *std::max_element(data, data + numel);
float sum = std::accumulate(data, data + numel, 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats: min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
// Log sample values at specific positions
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, data[i]);
if (dtype == infinicore::DataType::F32) {
float *data = reinterpret_cast<float*>(raw_data);
float min_val = *std::min_element(data, data + numel);
float max_val = *std::max_element(data, data + numel);
float sum = std::accumulate(data, data + numel, 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats: min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
// Log sample values at specific positions
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, data[i]);
}
}
} else if (dtype == infinicore::DataType::F16) {
// F16 is typically uint16_t, need to convert to float for logging
uint16_t *data = reinterpret_cast<uint16_t*>(raw_data);
std::vector<float> float_data(numel);
for (size_t i = 0; i < numel; ++i) {
// F16 to F32 conversion; zero/subnormal and inf/NaN are handled explicitly so
// the logged statistics are not skewed (subnormals are flushed to signed zero here)
uint16_t h = data[i];
uint32_t sign = (h >> 15) & 0x1;
uint32_t exp = (h >> 10) & 0x1F;
uint32_t mant = h & 0x3FF;
uint32_t f32;
if (exp == 0) { f32 = sign << 31; }                                          // +/-0 (subnormals flushed)
else if (exp == 31) { f32 = (sign << 31) | (0xFFu << 23) | (mant << 13); }   // inf / NaN
else { f32 = (sign << 31) | ((exp + 112) << 23) | (mant << 13); }
float_data[i] = *reinterpret_cast<float*>(&f32);
}
float min_val = *std::min_element(float_data.begin(), float_data.end());
float max_val = *std::max_element(float_data.begin(), float_data.end());
float sum = std::accumulate(float_data.begin(), float_data.end(), 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats (F16): min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
}
} else if (dtype == infinicore::DataType::BF16) {
// BF16 is typically uint16_t, need to convert to float for logging
uint16_t *data = reinterpret_cast<uint16_t*>(raw_data);
std::vector<float> float_data(numel);
for (size_t i = 0; i < numel; ++i) {
// BF16 to F32 conversion
uint16_t b = data[i];
uint32_t f32 = (static_cast<uint32_t>(b) << 16);
float_data[i] = *reinterpret_cast<float*>(&f32);
}
float min_val = *std::min_element(float_data.begin(), float_data.end());
float max_val = *std::max_element(float_data.begin(), float_data.end());
float sum = std::accumulate(float_data.begin(), float_data.end(), 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats (BF16): min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
// Also log last N values to see newly appended decode tokens
// This is critical for debugging precision issues at decode steps
if (numel > sample_count) {
SPDLOG_INFO(" Sample values (last {}):", sample_count);
for (size_t i = numel - sample_count; i < numel; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
}
}
}
}
} else {
SPDLOG_INFO(" {} (Stats computation skipped for non-F32 tensor)", name);
SPDLOG_INFO(" {} (Stats computation skipped for unsupported dtype)", name);
}
}
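// Hedged refactoring sketch (helper names are illustrative, not part of this
// change): the two half-precision branches above could share small bit-cast
// helpers instead of repeating the conversion inline. Requires <cstring> and <cstdint>.
inline float f16_bits_to_float(uint16_t h) {
    uint32_t sign = (h >> 15) & 0x1;
    uint32_t exp = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FF;
    uint32_t bits;
    if (exp == 0) {
        bits = sign << 31;                                  // zero (subnormals flushed)
    } else if (exp == 31) {
        bits = (sign << 31) | (0xFFu << 23) | (mant << 13); // inf / NaN
    } else {
        bits = (sign << 31) | ((exp + 112) << 23) | (mant << 13);
    }
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}
inline float bf16_bits_to_float(uint16_t b) {
    uint32_t bits = static_cast<uint32_t>(b) << 16;
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}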
......
......@@ -9,5 +9,7 @@ class InfinilmModel : public infinicore::nn::Module {
public:
virtual ~InfinilmModel() = default;
virtual infinicore::Tensor forward(std::vector<std::any>) const = 0;
// Optional: reset cache; default no-op for models without cache
virtual void reset_cache(size_t pos = 0) {}
};
} // namespace infinilm
......@@ -13,27 +13,32 @@
namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
: layer_idx_(layer_idx),
hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads),
head_dim_(config.head_dim),
kv_dim_(config.kv_dim()),
use_bias_(config.attention_bias) {
use_bias_(config.attention_bias),
use_output_bias_(config.attention_output_bias),
max_position_embeddings_(config.max_position_embeddings) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
// Output projection uses attention_output_bias (can be different from qkv)
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_,
dtype, device);
}
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
const infinicore::Tensor &position_ids,
void *kv_cache) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
......@@ -72,26 +77,22 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
throw std::runtime_error("Unexpected position_ids shape");
}
// 4. Process each batch item separately for attention computation
infinilm::cache::KVCache *external_cache = static_cast<infinilm::cache::KVCache *>(kv_cache);
// 4. Prepare KV caches
// Convert to [batch, n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations
q_reshaped = q_reshaped->permute({0, 2, 1, 3})->contiguous(); // [bs, n_q_head, seq_len, head_dim]
auto k_permuted = k_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
auto v_permuted = v_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
// 4. Prepare KV caches
infinilm::cache::DynamicCache *external_cache = static_cast<infinilm::cache::DynamicCache *>(kv_cache);
infinicore::Tensor k_total; // [bs, n_kv_head, total_seq_len, head_dim]
infinicore::Tensor v_total; // [bs, n_kv_head, total_seq_len, head_dim]
if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_permuted, v_permuted);
auto [k_total_tmp, v_total_tmp] = external_cache->update(layer_idx_, k_permuted, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
} else {
auto [k_total_tmp, v_total_tmp] = internal_cache_.update(k_permuted, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
// No external cache was provided - the attention layer now requires a model-level DynamicCache, so fail fast
throw std::runtime_error("LlamaAttention: kv_cache is required but nullptr provided");
}
auto total_seq_len = k_total->shape()[2];
......
......@@ -29,9 +29,11 @@ public:
*
* @param config Model configuration
* @param device Device to create tensors on
* @param layer_idx Layer index for cache access
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
......@@ -39,13 +41,18 @@ public:
*
* @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_cache Optional KV cache for incremental decoding
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
/**
* @brief Get the layer index
*/
size_t layer_idx() const { return layer_idx_; }
/**
* @brief Provide shared RoPE module from parent model.
*/
......@@ -68,15 +75,15 @@ protected:
std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
private:
size_t layer_idx_; // Layer index for cache access
size_t hidden_size_;
size_t num_attention_heads_;
size_t num_key_value_heads_;
size_t head_dim_;
size_t kv_dim_;
bool use_bias_;
// Internal KV cache for when no external cache is provided
mutable infinilm::cache::KVCache internal_cache_;
bool use_bias_; // Bias for Q/K/V projections
bool use_output_bias_; // Bias for output projection (o_proj)
size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility)
};
} // namespace infinilm::models::llama
......@@ -3,6 +3,7 @@
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
namespace infinilm::models::llama {
......@@ -37,14 +38,23 @@ struct LlamaConfig {
// Optional features
bool use_cache = true; // Whether to use KV cache
bool attention_bias = false; // Whether to use bias in attention projections
bool attention_bias = true; // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
bool mlp_bias = false; // Whether to use bias in MLP projections
bool tie_word_embeddings = false; // Whether to tie input/output embeddings
// Training/initialization parameters
double attention_dropout = 0.0; // Dropout ratio for attention probabilities
double initializer_range = 0.02; // Standard deviation for weight initialization
size_t pretraining_tp = 1; // Tensor parallelism rank used during pretraining
// Model metadata
std::string name_or_path = ""; // Model name or path identifier
// Token IDs
int64_t pad_token_id = -1; // Padding token ID (optional)
int64_t bos_token_id = 1; // Beginning of sequence token ID
int64_t eos_token_id = 2; // End of sequence token ID
std::vector<int64_t> bos_token_id = {1}; // Beginning of sequence token ID(s)
std::vector<int64_t> eos_token_id = {2}; // End of sequence token ID(s)
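// Hedged helper sketch (illustrative, would need <algorithm>): with eos_token_id
// now a vector, stop conditions should test membership rather than compare
// against a single id, e.g.
//   bool is_eos = std::find(eos_token_id.begin(), eos_token_id.end(), token)
//                 != eos_token_id.end();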
/**
* @brief Compute key-value dimension for Grouped Query Attention (GQA)
......
......@@ -5,7 +5,9 @@
namespace infinilm::models::llama {
LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
size_t layer_idx,
infinicore::DataType dtype)
: layer_idx_(layer_idx) {
// Initialize layer normalization layers
INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps,
dtype, device);
......@@ -13,7 +15,7 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore
dtype, device);
// Initialize attention and MLP modules
INFINICORE_NN_MODULE_INIT(self_attn, config, device, dtype);
INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, dtype);
INFINICORE_NN_MODULE_INIT(mlp, config, device, dtype);
}
......
......@@ -28,9 +28,11 @@ public:
*
* @param config Model configuration
* @param device Device to create tensors on
* @param layer_idx Layer index for cache management and debugging
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
......@@ -45,6 +47,11 @@ public:
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
/**
* @brief Get the layer index
*/
size_t layer_idx() const { return layer_idx_; }
void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
if (self_attn_) {
self_attn_->set_rotary_emb(rotary_emb);
......@@ -60,6 +67,9 @@ protected:
// Attention and MLP
INFINICORE_NN_MODULE(LlamaAttention, self_attn);
INFINICORE_NN_MODULE(LlamaMLP, mlp);
private:
size_t layer_idx_; // Layer index for cache management and debugging
};
} // namespace infinilm::models::llama
#include "llama_for_causal_lm.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/context/context.hpp"
#include <iostream>
namespace infinilm::models::llama {
LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
// Initialize module's device_ member
device_ = device;
// Initialize base model
INFINICORE_NN_MODULE_INIT(model, config, device, dtype);
......@@ -20,14 +24,21 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::
infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
void *kv_cache) const {
// 1. Forward through base model to get hidden states
auto position_ids_device = position_ids->to(device_);
auto hidden_states = model_->forward(input_ids, position_ids_device, kv_caches);
auto hidden_states = model_->forward(input_ids, position_ids_device, kv_cache);
// 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states);
// 3. CRITICAL: Synchronize the C++ backend's context after forward pass
// This ensures all C++ backend operations complete before returning to Python
if (device_.getType() != infinicore::Device::Type::CPU) {
infinicore::context::setDevice(device_, false);
infinicore::context::syncStream();
}
return logits;
}
......@@ -49,4 +60,8 @@ infinicore::Tensor LlamaForCausalLM::forward(std::vector<std::any> args) const {
return forward(input_ids, position_ids, kv_caches);
}
void LlamaForCausalLM::reset_cache(size_t pos) {
model_->reset_cache(pos);
}
} // namespace infinilm::models::llama
......@@ -35,18 +35,18 @@ public:
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Logits tensor of shape [batch, seq_len, vocab_size]
*
* Note: This is a placeholder forward method. The actual implementation
* will be added when integrating with the inference engine.
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
void *kv_cache = nullptr) const;
infinicore::Tensor forward(std::vector<std::any> args) const override;
// Reset internal cache position
void reset_cache(size_t pos = 0) override;
// Module information
const LlamaConfig &config() const { return model_->config(); }
LlamaModel &model() { return *model_; }
......
......@@ -13,9 +13,15 @@ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &devi
INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size,
std::nullopt, dtype, device);
// Initialize decoder layers
INFINICORE_NN_MODULE_VEC_INIT(layers, config.num_hidden_layers, LlamaDecoderLayer,
config, device, dtype);
// Initialize decoder layers with layer indices
// TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments
// (e.g., via a factory function or lambda that receives the layer index)
// Currently, we can't use the macro because each layer needs a different layer_idx
layers_.reserve(config.num_hidden_layers);
for (size_t i = 0; i < config.num_hidden_layers; ++i) {
layers_.push_back(this->register_module<LlamaDecoderLayer>(
"layers." + std::to_string(i), config, device, i, dtype));
}
// Initialize final layer normalization
INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps,
......@@ -36,27 +42,59 @@ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &devi
infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
void *kv_cache) const {
// Use persistent internal cache if no external cache is provided
// This matches Python backend behavior: if use_cache and past_key_values is None, create DynamicCache
// The cache persists across forward calls to enable incremental decoding
void *cache_to_use = kv_cache;
if (kv_cache == nullptr) {
// Create or reuse persistent internal cache at model level
// This ensures the cache persists across multiple forward calls (prefill -> decode -> decode...)
size_t seq_len = input_ids->shape()[1];
if (!cache_) {
// First time: create cache
cache_ = std::make_unique<infinilm::cache::DynamicCache>(
config_.num_hidden_layers,
config_.max_position_embeddings
);
}
cache_to_use = cache_.get();
}
// 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
auto hidden_states = embed_tokens_->forward(input_ids);
// 2. Process through all decoder layers
for (size_t i = 0; i < layers_.size(); ++i) {
void *kv_cache = (kv_caches && i < kv_caches->size()) ? (*kv_caches)[i] : nullptr;
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, kv_cache);
size_t num_layers = layers_.size();
for (size_t i = 0; i < num_layers; ++i) {
// Pass model-level cache (layer index is now a property of the layer)
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, cache_to_use);
// DEBUG: Disabled previous final layer logging
// Logging moved to decoder layer for post-attention normalization
}
// 3. Apply final layer normalization to last token only (aligns with transformers)
// Narrow to last token: [batch, seq_len, hidden_size] -> [batch, 1, hidden_size]
auto shape = hidden_states->shape();
size_t seq_len = shape[1];
auto last_token = hidden_states; //->narrow({{1, seq_len - 1, 1}});
auto last_token = hidden_states->narrow({{1, seq_len - 1, 1}});
auto normalized_states = norm_->forward(hidden_states);
auto normalized_last_token = normalized_states->narrow({{1, seq_len - 1, 1}});
// DEBUG: Disabled previous final layer normalization logging
// Normalize only the last token (matches Python backend)
auto normalized_last_token = norm_->forward(last_token);
return normalized_last_token;
}
void LlamaModel::reset_cache(size_t pos) const {
if (cache_) {
cache_->reset(pos);
}
}
} // namespace infinilm::models::llama
......@@ -2,6 +2,7 @@
#include "llama_config.hpp"
#include "llama_decoder_layer.hpp"
#include "cache/kv_cache.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"
......@@ -9,6 +10,7 @@
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <vector>
#include <memory>
namespace infinilm::models::llama {
......@@ -40,21 +42,27 @@ public:
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*
* Note: This is a placeholder forward method. The actual implementation
* will be added when integrating with the inference engine.
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
void *kv_cache = nullptr) const;
// Module information
const LlamaConfig &config() const { return config_; }
size_t num_layers() const { return config_.num_hidden_layers; }
/**
* @brief Reset the internal cache to a specific position
* This should be called when starting a new generation sequence to prevent state
* from persisting between different questions/prompts
* @param pos Position to reset to (defaults to 0)
*/
void reset_cache(size_t pos = 0) const;
protected:
// Token embeddings
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
......@@ -70,6 +78,10 @@ protected:
private:
LlamaConfig config_;
// Persistent cache for when no external cache is provided
// Mutable because it's not part of the model's learned parameters,
// but needs to persist across forward calls for incremental decoding
mutable std::unique_ptr<infinilm::cache::DynamicCache> cache_;
};
} // namespace infinilm::models::llama
......@@ -51,7 +51,11 @@ inline void bind_infer_engine(py::module &m) {
}
return result;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments");
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments")
.def("reset_cache", &InferEngine::reset_cache,
py::arg("pos") = 0, py::arg("async") = false,
"Reset the internal cache in all workers to a specific position (clears state between generations). "
"By default, this is synchronous. If async=True, this becomes asynchronous (unstable - use with caution).");
// Optionally, you can add __repr__ for debugging
m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) {
......
......@@ -56,105 +56,80 @@ inline void bind_llama(py::module &m) {
.def_readwrite("model_type", &LlamaConfig::model_type)
.def_readwrite("rope_theta", &LlamaConfig::rope_theta)
.def_readwrite("attention_bias", &LlamaConfig::attention_bias)
.def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
.def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
.def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
.def_readwrite("use_cache", &LlamaConfig::use_cache)
.def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
.def_readwrite("initializer_range", &LlamaConfig::initializer_range)
.def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
.def_readwrite("name_or_path", &LlamaConfig::name_or_path)
.def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
.def_readwrite("bos_token_id", &LlamaConfig::bos_token_id)
.def_readwrite("eos_token_id", &LlamaConfig::eos_token_id)
.def("validate", &LlamaConfig::validate)
.def("kv_dim", &LlamaConfig::kv_dim);
// Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
// Helper function to convert Python object (InfiniCore tensor, numpy array, or torch tensor) to C++ Tensor
auto convert_to_tensor = [](py::object obj, const Device &device) -> infinicore::Tensor {
// First check if it's already an InfiniCore tensor (has _underlying attribute)
if (py::hasattr(obj, "_underlying")) {
try {
// Extract the underlying C++ tensor from Python InfiniCore tensor
auto underlying = obj.attr("_underlying");
auto infini_tensor = underlying.cast<infinicore::Tensor>();
return infini_tensor;
} catch (const py::cast_error &) {
// Fall through to other conversion methods
}
}
// Try direct cast (in case it's already a C++ tensor exposed to Python)
try {
auto infini_tensor = obj.cast<infinicore::Tensor>();
return infini_tensor;
} catch (const py::cast_error &) {
// Not an InfiniCore tensor, continue with other conversions
}
// Try to get data pointer and shape from numpy array or torch tensor
void *data_ptr = nullptr;
std::vector<size_t> shape;
infinicore::DataType dtype = infinicore::DataType::F32;
// Check if it's a numpy array
if (py::hasattr(obj, "__array_interface__")) {
auto array_info = obj.attr("__array_interface__");
auto data = array_info["data"];
if (py::isinstance<py::tuple>(data)) {
auto data_tuple = data.cast<py::tuple>();
data_ptr = reinterpret_cast<void *>(data_tuple[0].cast<uintptr_t>());
} else {
data_ptr = reinterpret_cast<void *>(data.cast<uintptr_t>());
}
auto shape_obj = array_info["shape"];
if (py::isinstance<py::tuple>(shape_obj)) {
auto shape_tuple = shape_obj.cast<py::tuple>();
for (auto dim : shape_tuple) {
shape.push_back(dim.cast<size_t>());
.def_property("bos_token_id",
[](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.bos_token_id);
},
[](LlamaConfig &self, py::object value) {
// Accept both single int and list
if (py::isinstance<py::int_>(value)) {
self.bos_token_id = {value.cast<int64_t>()};
} else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
self.bos_token_id = value.cast<std::vector<int64_t>>();
} else {
throw py::type_error("bos_token_id must be int or list of ints");
}
} else {
shape.push_back(shape_obj.cast<size_t>());
}
// Get dtype
std::string typestr = array_info["typestr"].cast<std::string>();
if (typestr == "<f4" || typestr == "float32") {
dtype = infinicore::DataType::F32;
} else if (typestr == "<f2" || typestr == "float16") {
dtype = infinicore::DataType::F16;
} else if (typestr == "<i4" || typestr == "int32") {
dtype = infinicore::DataType::I32;
} else if (typestr == "<i8" || typestr == "int64") {
dtype = infinicore::DataType::I64;
}
} else if (py::hasattr(obj, "data_ptr")) {
// Try torch tensor
data_ptr = reinterpret_cast<void *>(obj.attr("data_ptr")().cast<uintptr_t>());
auto shape_obj = obj.attr("shape");
if (py::isinstance<py::tuple>(shape_obj) || py::isinstance<py::list>(shape_obj)) {
for (auto dim : shape_obj) {
shape.push_back(dim.cast<size_t>());
})
.def_property("eos_token_id",
[](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.eos_token_id);
},
[](LlamaConfig &self, py::object value) {
// Accept both single int and list
if (py::isinstance<py::int_>(value)) {
self.eos_token_id = {value.cast<int64_t>()};
} else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
self.eos_token_id = value.cast<std::vector<int64_t>>();
} else {
throw py::type_error("eos_token_id must be int or list of ints");
}
} else {
shape.push_back(shape_obj.cast<size_t>());
}
// Get dtype from torch tensor
std::string dtype_str = py::str(obj.attr("dtype"));
if (dtype_str.find("float32") != std::string::npos) {
dtype = infinicore::DataType::F32;
} else if (dtype_str.find("float16") != std::string::npos) {
dtype = infinicore::DataType::F16;
} else if (dtype_str.find("int32") != std::string::npos) {
dtype = infinicore::DataType::I32;
} else if (dtype_str.find("int64") != std::string::npos) {
dtype = infinicore::DataType::I64;
}
} else {
throw std::runtime_error("Unsupported tensor type. Expected InfiniCore tensor, numpy array, or torch tensor.");
}
})
.def("validate", &LlamaConfig::validate)
.def("kv_dim", &LlamaConfig::kv_dim)
// Add __dir__ to make attributes discoverable via dir() in Python
.def("__dir__", [](const LlamaConfig &self) {
py::list dir_list;
dir_list.append("vocab_size");
dir_list.append("hidden_size");
dir_list.append("intermediate_size");
dir_list.append("num_hidden_layers");
dir_list.append("num_attention_heads");
dir_list.append("num_key_value_heads");
dir_list.append("head_dim");
dir_list.append("max_position_embeddings");
dir_list.append("rms_norm_eps");
dir_list.append("hidden_act");
dir_list.append("model_type");
dir_list.append("rope_theta");
dir_list.append("attention_bias");
dir_list.append("attention_output_bias");
dir_list.append("mlp_bias");
dir_list.append("tie_word_embeddings");
dir_list.append("use_cache");
dir_list.append("attention_dropout");
dir_list.append("initializer_range");
dir_list.append("pretraining_tp");
dir_list.append("name_or_path");
dir_list.append("pad_token_id");
dir_list.append("bos_token_id");
dir_list.append("eos_token_id");
dir_list.append("validate");
dir_list.append("kv_dim");
return dir_list;
});
return infinicore::Tensor::from_blob(data_ptr, shape, dtype, device);
};
// Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
// Bind LlamaForCausalLM
py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
......@@ -190,55 +165,62 @@ inline void bind_llama(py::module &m) {
return tensor;
}
throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
.def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
.def("load_state_dict", [](LlamaForCausalLM &model, py::dict state_dict) {
// Convert Python dict to C++ state_dict
std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
for (auto item : state_dict) {
std::string key = item.first.cast<std::string>();
py::object value = item.second.cast<py::object>();
cpp_state_dict.emplace(key, convert_to_tensor(value, device));
// Extract InfiniCore tensor from Python object
infinicore::Tensor tensor;
if (py::hasattr(value, "_underlying")) {
tensor = value.attr("_underlying").cast<infinicore::Tensor>();
} else {
tensor = value.cast<infinicore::Tensor>();
}
cpp_state_dict.emplace(key, tensor);
}
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"), py::arg("device"))
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"))
.def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
.def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
// Helper to extract C++ tensor from Python object
auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
.def(
"reset_cache", [](const LlamaForCausalLM &model, size_t pos = 0) {
// Reset the internal cache to prevent state from persisting between generations
model.model().reset_cache(pos);
}, py::arg("pos") = 0, "Reset the internal cache to a specific position (clears state between generations)")
.def("forward", [](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_cache = py::none()) {
// Helper to extract C++ tensor from Python InfiniCore tensor
auto get_tensor = [](py::object obj) -> infinicore::Tensor {
// If it's already a Python InfiniCore tensor wrapper, extract underlying
if (py::hasattr(obj, "_underlying")) {
return obj.attr("_underlying").cast<infinicore::Tensor>();
}
// Try direct cast (in case it's already a C++ tensor)
try {
return obj.cast<infinicore::Tensor>();
} catch (const py::cast_error &) {
// Extract device from first tensor for conversion
Device device = Device(Device::Type::CPU, 0);
if (py::hasattr(obj, "device")) {
try {
auto py_device = obj.attr("device");
if (py::hasattr(py_device, "_underlying")) {
device = py_device.attr("_underlying").cast<Device>();
} else {
device = py_device.cast<Device>();
}
} catch (...) {
// Keep default CPU device
}
}
return convert_to_tensor(obj, device);
}
return obj.cast<infinicore::Tensor>();
};
// Convert Python tensors to C++ tensors
// Extract InfiniCore tensors from Python objects
auto infini_input_ids = get_tensor(input_ids);
auto infini_position_ids = get_tensor(position_ids);
// Handle kv_caches if provided
std::vector<void *> *kv_caches_ptr = nullptr;
// Handle kv_cache if provided (model-level DynamicCache)
void *kv_cache_ptr = nullptr;
if (!kv_cache.is_none()) {
// Try to extract DynamicCache from Python object
if (py::hasattr(kv_cache, "_underlying")) {
kv_cache_ptr = kv_cache.attr("_underlying").cast<void *>();
} else {
// Try direct cast
try {
kv_cache_ptr = kv_cache.cast<void *>();
} catch (...) {
// If conversion fails, pass nullptr (cache will be ignored)
kv_cache_ptr = nullptr;
}
}
}
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); },
//
py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
return model.forward(infini_input_ids, infini_position_ids, kv_cache_ptr);
}, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
} // namespace infinilm::models::llama
......@@ -119,6 +119,19 @@ class GenerationMixin:
):
model_kwargs = kwargs
# -------------------------------------------------------------------- #
# CRITICAL: Reset internal cache before each new generation
# This prevents state from persisting between different questions/prompts
# -------------------------------------------------------------------- #
# Check if this is a cpp backend model (has _model attribute with reset_cache method)
if hasattr(self, '_model') and hasattr(self._model, 'reset_cache'):
try:
self._model.reset_cache()
except Exception as e:
# If reset_cache fails, log but continue (shouldn't happen)
import warnings
warnings.warn(f"Failed to reset cache: {e}")
# -------------------------------------------------------------------- #
# Create the cache #
# -------------------------------------------------------------------- #
......@@ -166,6 +179,12 @@ class GenerationMixin:
[eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
)
# Extract sampling parameters from kwargs with defaults
random_val = model_kwargs.get("random_val", 0.1)
topp = model_kwargs.get("topp", 0.8)
topk = model_kwargs.get("topk", 1)
temperature = model_kwargs.get("temperature", 1.0)
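# Hedged usage sketch (caller side; keyword names assumed from the defaults above):
# sampling can now be tuned per generate() call, e.g.
#   model.generate(input_ids, topk=50, topp=0.9, temperature=0.7, random_val=0.05)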
# -------------------------------------------------------------------------- #
# Initialize position_ids
# -------------------------------------------------------------------------- #
......@@ -189,6 +208,7 @@ class GenerationMixin:
# Run the forward pass once (prefill)
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self(**model_inputs)
# -------------------------------------------------------------------------- #
......@@ -206,15 +226,16 @@ class GenerationMixin:
dtype=infinicore.int32,
device=token_scores.device,
)
for i in range(0, batch_size):
score = token_scores.narrow(0, i, 1).view((vocab_size,))
out = next_tokens.narrow(0, i, 1).view([])
infinicore.nn.functional.random_sample(
score,
0.8,
0.1,
1,
1.0,
random_val,
topp,
topk,
temperature,
out=out,
)
......@@ -236,7 +257,6 @@ class GenerationMixin:
print(output_str, end="", flush=True)
if stop_on_eos and token_id in eos_token_id_list:
break
print("\n</s>")
print(f"\n\n\n Generation completed in {round(sum(time_list), 2)} ms")
print(
......
......@@ -14,6 +14,9 @@ class LlamaConfig:
This class wraps configuration_llama.LlamaConfig and provides
a _underlying property that creates the C++ config object.
Automatically detects and handles both regular Llama models and Jiuge models
(fm9g7b, fm9g, minicpm) with appropriate defaults and validation.
"""
def __init__(self, config_dict=None, **kwargs):
......@@ -64,22 +67,61 @@ class LlamaConfig:
except (AttributeError, TypeError):
pass
# Handle defaults
if (
not hasattr(self._cpp_config, "num_key_value_heads")
or self._cpp_config.num_key_value_heads == 0
):
# Handle num_key_value_heads with validation
python_num_kv_heads = getattr(self._python_config, "num_key_value_heads", None)
if python_num_kv_heads is None or python_num_kv_heads == 0:
self._cpp_config.num_key_value_heads = (
self._cpp_config.num_attention_heads
)
if (
not hasattr(self._cpp_config, "head_dim")
or self._cpp_config.head_dim == 0
):
self._cpp_config.head_dim = (
self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
)
else:
self._cpp_config.num_key_value_heads = python_num_kv_heads
# Handle head_dim with validation (critical for GEMM operations)
python_head_dim = getattr(self._python_config, "head_dim", None)
if python_head_dim is None or python_head_dim == 0:
# Compute from hidden_size and num_attention_heads
if self._cpp_config.hidden_size > 0 and self._cpp_config.num_attention_heads > 0:
computed_head_dim = self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
self._cpp_config.head_dim = computed_head_dim
else:
raise ValueError(
f"Cannot compute head_dim: hidden_size={self._cpp_config.hidden_size}, "
f"num_attention_heads={self._cpp_config.num_attention_heads}"
)
else:
# Use from Python config
self._cpp_config.head_dim = python_head_dim
# Validate it matches expected value (warn but allow for flexibility)
if self._cpp_config.hidden_size > 0 and self._cpp_config.num_attention_heads > 0:
expected_head_dim = self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
if self._cpp_config.head_dim != expected_head_dim:
import warnings
warnings.warn(
f"head_dim ({self._cpp_config.head_dim}) != hidden_size/num_attention_heads ({expected_head_dim}). "
f"Using head_dim from config."
)
# Ensure vocab_size is set (explicit handling)
if hasattr(self._python_config, "vocab_size"):
self._cpp_config.vocab_size = self._python_config.vocab_size
# Validate config after setting all values (especially important for jiuge models)
if not self._cpp_config.validate():
raise ValueError("C++ LlamaConfig validation failed. Check config values.")
# Log key config values for debugging (especially useful for jiuge models)
import logging
logger = logging.getLogger(__name__)
logger.info(
f"LlamaConfig ({self._python_config.model_type}) C++ LlamaConfig created: vocab_size={self._cpp_config.vocab_size}, "
f"hidden_size={self._cpp_config.hidden_size}, "
f"num_attention_heads={self._cpp_config.num_attention_heads}, "
f"num_key_value_heads={self._cpp_config.num_key_value_heads}, "
f"head_dim={self._cpp_config.head_dim}, "
f"kv_dim={self._cpp_config.kv_dim()}, "
f"attention_bias={self._cpp_config.attention_bias}, "
f"attention_output_bias={self._cpp_config.attention_output_bias}"
)
return self._cpp_config
......@@ -104,13 +146,23 @@ class LlamaForCausalLM(GenerationMixin):
"""
super().__init__()
self.config = config
# Convert config to LlamaConfig (handles both regular Llama and Jiuge models)
if isinstance(config, dict):
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
# Not a dict or LlamaConfig, try to convert
config = LlamaConfig(config)
# If already LlamaConfig, use as-is (it will auto-detect jiuge models)
if device is None:
device = infinicore.device()
self.use_cache = False
# Store the Python wrapper config so it can be accessed later
# This is needed for DynamicCache which calls config.get_text_config()
self._config = config
self._device = device
# self._model = _infinilm.LlamaForCausalLM(
# config._underlying, device._underlying, dtype
......@@ -149,10 +201,13 @@ class LlamaForCausalLM(GenerationMixin):
"""
return self._model.get_parameter(name)
# @property
# def config(self):
# """Get model configuration"""
# return self._model.config()
@property
def config(self):
"""Get model configuration"""
# Return the Python wrapper config instead of C++ config
# This ensures compatibility with code that expects PretrainedConfig methods
# like get_text_config() used by DynamicCache
return self._config
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
......@@ -195,5 +250,6 @@ class LlamaForCausalLM(GenerationMixin):
with open(config_path, "r") as f:
config_dict = json.load(f)
# LlamaConfig automatically detects and handles jiuge models
config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype, **kwargs)
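# Hedged end-to-end sketch (paths and keyword arguments are illustrative only):
#   model = LlamaForCausalLM.from_pretrained("/path/to/model", device=infinicore.device())
#   model.generate(input_ids, topk=1, temperature=1.0)
#   # generate() resets the KV cache automatically; it can also be rewound manually:
#   model._model.reset_cache(0)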