Commit d6a641d3 authored by Ceng23333

issue/74 add c++ Llama models and align to AutoLlama interface


Signed-off-by: Ceng23333 <441651826@qq.com>
parent 3c6ad521
# Xmake cache
.xmake/
build/
python/infinilm/lib/*.so
# MacOS Cache
.DS_Store
@@ -10,12 +11,13 @@ build/
# Python
__pycache__/
*.egg-info/
# Log
*.log
# Cache
.cache/
# JSON
*.json
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
@@ -27,7 +27,7 @@ python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,
- Benchmark the inference service performance
```bash
python scripts/test_perf.py
```
- Test model perplexity (PPL) using the inference service
@@ -39,19 +39,32 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
## Usage (new)
- Build and install `InfiniCore`; see the InfiniCore [`README`](https://github.com/InfiniTensor/InfiniCore) for details:
  - Make sure to set the `INFINI_ROOT` environment variable as prompted (defaults to `$HOME/.infini`)
  - Choose the xmake build configuration for your hardware platform
  - Build and install InfiniCore
    - Install the C++ library
    - Install the Python package
- Build and install the `InfiniLM` Python package
  - Install third-party dependencies
```bash
git submodule update --init --recursive
```
  - Install the InfiniLM Python package
```bash
pip install -e .
```
- Single-shot inference test
  - Llama example
```bash
python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
```
  - For example:
```bash
python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
```
\ No newline at end of file
#pragma once
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <algorithm>
#include <utility>
#include <memory>
namespace infinilm::cache {
/**
* @brief Simple KV cache structure for incremental decoding
*
* Stores key and value caches with shape [n_kv_head, capacity, head_dim]
* Similar to DynamicLayer in Python cache_utils.py
*
* This is a common component that can be used by any model architecture
* that needs KV caching for attention mechanisms.
*/
struct KVCache {
infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
size_t cache_position; // Current position in cache
size_t max_capacity; // Maximum capacity of cache
bool initialized; // Whether cache has been initialized
KVCache()
// Create empty placeholder tensors (will be replaced on first use);
// initializers are listed in member declaration order to avoid -Wreorder warnings
: k_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
infinicore::Device(infinicore::Device::Type::CPU, 0))),
v_cache(infinicore::Tensor::empty({1, 1, 1}, infinicore::DataType::F32,
infinicore::Device(infinicore::Device::Type::CPU, 0))),
cache_position(0), max_capacity(0), initialized(false) {}
/**
* @brief Initialize or update cache capacity
* @param num_kv_heads Number of key-value heads
* @param head_dim Head dimension
* @param seq_len Sequence length of new tokens
* @param dtype Data type
* @param device Device
*/
void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) {
size_t required_capacity = cache_position + seq_len;
// Lazy initialization
if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
dtype, device);
v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},
dtype, device);
cache_position = 0;
initialized = true;
}
// Grow cache if needed (similar to DynamicLayer in Python)
else if (required_capacity > max_capacity) {
size_t new_capacity = std::max(max_capacity * 2, required_capacity);
auto k_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
dtype, device);
auto v_new = infinicore::Tensor::empty({num_kv_heads, new_capacity, head_dim},
dtype, device);
// Copy existing cache data
if (cache_position > 0) {
auto k_slice = k_cache->narrow({{1, 0, cache_position}});
auto v_slice = v_cache->narrow({{1, 0, cache_position}});
k_new->narrow({{1, 0, cache_position}})->copy_from(k_slice);
v_new->narrow({{1, 0, cache_position}})->copy_from(v_slice);
}
k_cache = k_new;
v_cache = v_new;
max_capacity = new_capacity;
}
}
/**
* @brief Update cache with new key and value states
* @param k_new New key states [n_kv_head, seq_len, head_dim]
* @param v_new New value states [n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [n_kv_head, total_seq_len, head_dim]
*
* Note: This method writes to the cache. If using with attention op, the attention op
* also writes to the cache, so this should be called AFTER attention, not before.
*/
std::pair<infinicore::Tensor, infinicore::Tensor> update(
const infinicore::Tensor &k_new,
const infinicore::Tensor &v_new) {
size_t seq_len = k_new->shape()[1];
size_t num_kv_heads = k_new->shape()[0];
size_t head_dim = k_new->shape()[2];
// Ensure capacity
ensure_capacity(num_kv_heads, head_dim, seq_len,
k_new->dtype(), k_new->device());
// Copy new k/v into cache at current position
auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});
auto v_dst = v_cache->narrow({{1, cache_position, seq_len}});
k_dst->copy_from(k_new);
v_dst->copy_from(v_new);
// Update position
cache_position += seq_len;
// Return the total cache up to current position
auto k_total = k_cache->narrow({{1, 0, cache_position}});
auto v_total = v_cache->narrow({{1, 0, cache_position}});
return std::make_pair(k_total->contiguous(), v_total->contiguous());
}
};
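// The update() flow above mirrors incremental decoding: prefill writes the prompt's
// K/V once, then each decode step appends one more position. A minimal usage sketch
// (not part of the original commit; the helper name `example_kv_cache_step` and the
// caller-supplied tensors are assumptions):
inline std::pair<infinicore::Tensor, infinicore::Tensor> example_kv_cache_step(
    KVCache &cache,
    const infinicore::Tensor &k_new,   // [n_kv_head, seq_len, head_dim]
    const infinicore::Tensor &v_new) { // [n_kv_head, seq_len, head_dim]
    // Appends k_new/v_new at cache_position (growing capacity if needed) and returns
    // the full prefix [n_kv_head, cache_position, head_dim] for attention.
    return cache.update(k_new, v_new);
}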
} // namespace infinilm::cache
#include "hooks.hpp"
#include <spdlog/spdlog.h>
namespace infinilm::models::debug_utils {
void HookRegistry::register_hook(const std::string &name, HookCallback callback) {
hooks_[name] = callback;
SPDLOG_DEBUG("HookRegistry: Registered hook '{}'", name);
}
void HookRegistry::call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx) const {
// Try exact match first
auto it = hooks_.find(name);
if (it != hooks_.end()) {
try {
it->second(name, tensor, layer_idx);
} catch (const std::exception &e) {
SPDLOG_ERROR("HookRegistry: Error calling hook '{}': {}", name, e.what());
}
return;
}
// Try pattern matching (e.g., "layer0_*" matches "layer0_q_after_proj")
for (const auto &[pattern, callback] : hooks_) {
if (!pattern.empty() && pattern.back() == '*' && name.size() >= pattern.size() - 1) {
std::string prefix = pattern.substr(0, pattern.size() - 1);
if (name.substr(0, prefix.size()) == prefix) {
try {
callback(name, tensor, layer_idx);
} catch (const std::exception &e) {
SPDLOG_ERROR("HookRegistry: Error calling hook pattern '{}' for '{}': {}", pattern, name, e.what());
}
return;
}
}
}
}
void HookRegistry::clear() {
hooks_.clear();
SPDLOG_DEBUG("HookRegistry: Cleared all hooks");
}
} // namespace infinilm::models::debug_utils
#pragma once
#include "infinicore/tensor.hpp"
#include <functional>
#include <string>
#include <memory>
#include <unordered_map>
namespace infinilm::models::debug_utils {
// TODO: move to InfiniCore as common utils in future work
/**
* @brief Hook callback type for capturing intermediate values (DEBUG ONLY)
*
* Hook functions are called with:
* - name: Identifier for the intermediate value (e.g., "layer0_q_after_proj")
* - tensor: The intermediate tensor value
* - layer_idx: Layer index (for layer-specific hooks, -1 if not applicable)
*
* NOTE: This is a debug utility. Do not use in production code.
*/
using HookCallback = std::function<void(const std::string &name, const infinicore::Tensor &tensor, int layer_idx)>;
/**
* @brief Hook registry for managing hooks (DEBUG ONLY)
*
* NOTE: This is a debug utility for capturing intermediate tensor values
* during model execution. Do not use in production code.
*/
class HookRegistry {
public:
/**
* @brief Register a hook callback
*
* @param name Hook name (can be pattern like "layer0_*" or specific name)
* @param callback Hook callback function
*/
void register_hook(const std::string &name, HookCallback callback);
/**
* @brief Call hook if registered
*
* @param name Full hook name
* @param tensor Tensor to pass to hook
* @param layer_idx Layer index (-1 if not applicable)
*/
void call_hook(const std::string &name, const infinicore::Tensor &tensor, int layer_idx = -1) const;
/**
* @brief Clear all hooks
*/
void clear();
/**
* @brief Check if any hooks are registered
*/
bool has_hooks() const { return !hooks_.empty(); }
private:
std::unordered_map<std::string, HookCallback> hooks_;
};
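// A minimal sketch (not part of the original commit) of how a debug hook might be
// registered: the pattern "layer0_*" matches any hook name emitted with that prefix,
// and the lambda signature follows HookCallback above. The helper name is hypothetical.
inline void example_register_logging_hook(HookRegistry &registry) {
    registry.register_hook("layer0_*",
                           [](const std::string &name,
                              const infinicore::Tensor & /*tensor*/,
                              int layer_idx) {
                               // A real hook could inspect or log the tensor here,
                               // e.g. via debug_utils::log_tensor_stats(tensor, name).
                               (void)name;
                               (void)layer_idx;
                           });
}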
/**
* @brief Macro to simplify hook registration (DEBUG ONLY)
*
* Usage: REGISTER_HOOK(registry, "hook_name", callback)
*/
#define REGISTER_HOOK(registry, name, callback) \
(registry)->register_hook(name, callback)
/**
* @brief Macro to simplify hook calls with automatic null and has_hooks checks (DEBUG ONLY)
*
* Usage: CALL_HOOK(registry, "hook_name", tensor)
* Note: layer_idx defaults to -1
*/
#define CALL_HOOK(registry, name, tensor) \
do { \
if ((registry) && (registry)->has_hooks()) { \
(registry)->call_hook(name, tensor, -1); \
} \
} while (0)
/**
* @brief Macro to simplify hook calls with explicit layer index (DEBUG ONLY)
*
* Usage: CALL_HOOK_LAYER(registry, "hook_name", tensor, layer_idx)
*/
#define CALL_HOOK_LAYER(registry, name, tensor, layer_idx) \
do { \
if ((registry) && (registry)->has_hooks()) { \
(registry)->call_hook(name, tensor, layer_idx); \
} \
} while (0)
/**
* @brief Macros to simplify hook_registry and hook_prefix management in model classes
*/
// Declare hook_registry and hook_prefix member variables
#define HOOK_REGISTRY_MEMBER() \
std::shared_ptr<debug_utils::HookRegistry> hook_registry_; \
std::string hook_prefix_;
// Set hook_registry and hook_prefix (no forwarding to submodules)
#define SET_HOOK_REGISTRY_SIMPLE() \
void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
hook_registry_ = hook_registry; \
hook_prefix_ = hook_prefix; \
}
// Helper macro to build incremental hook prefix
#define BUILD_HOOK_PREFIX(prefix, name) \
(prefix.empty() ? std::string(name) : prefix + "_" + std::string(name))
// Set hook_registry and hook_prefix and forward to one or more submodules
// Usage: SET_HOOK_REGISTRY(submodule1) or SET_HOOK_REGISTRY(submodule1, submodule2)
// The hook_prefix will be incremented for each submodule (e.g., "layer0" -> "layer0_attention")
// Note: Currently supports up to 2 submodules. For more, extend the pattern below.
#define SET_HOOK_REGISTRY(...) \
SET_HOOK_REGISTRY_IMPL(__VA_ARGS__)
// Helper to handle variable number of arguments using a reliable pattern
#define SET_HOOK_REGISTRY_IMPL(...) \
SET_HOOK_REGISTRY_GET_NTH(__VA_ARGS__, SET_HOOK_REGISTRY_2, SET_HOOK_REGISTRY_1, SET_HOOK_REGISTRY_0,)(__VA_ARGS__)
// Select the implementation based on argument count: each user argument shifts the
// trailing implementation names one position to the right, so parameter _3 always
// names the matching implementation.
// For 1 arg:  _1=arg,  _2=SET_HOOK_REGISTRY_2, _3=SET_HOOK_REGISTRY_1 → selects SET_HOOK_REGISTRY_1
// For 2 args: _1=arg1, _2=arg2, _3=SET_HOOK_REGISTRY_2 → selects SET_HOOK_REGISTRY_2
#define SET_HOOK_REGISTRY_GET_NTH(_1, _2, _3, N, ...) _3
// Implementation for 0 args (not reachable through the dispatcher above; kept as a fallback)
#define SET_HOOK_REGISTRY_0() \
void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
hook_registry_ = hook_registry; \
hook_prefix_ = hook_prefix; \
}
// Implementation for 1 arg
#define SET_HOOK_REGISTRY_1(submodule) \
void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
hook_registry_ = hook_registry; \
hook_prefix_ = hook_prefix; \
if (submodule##_) { \
std::string submodule_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule); \
submodule##_->set_hook_registry(hook_registry, submodule_prefix); \
} \
}
// Implementation for 2 args
#define SET_HOOK_REGISTRY_2(submodule1, submodule2) \
void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
hook_registry_ = hook_registry; \
hook_prefix_ = hook_prefix; \
if (submodule1##_) { \
std::string submodule1_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule1); \
submodule1##_->set_hook_registry(hook_registry, submodule1_prefix); \
} \
if (submodule2##_) { \
std::string submodule2_prefix = BUILD_HOOK_PREFIX(hook_prefix, #submodule2); \
submodule2##_->set_hook_registry(hook_registry, submodule2_prefix); \
} \
}
// Set hook_registry and hook_prefix for a vector of submodules
// For vectors, the prefix is incremented with an index (e.g., "layer0", "layer1", ...)
// If parent has a prefix, it becomes "parent_layer0", "parent_layer1", etc.
#define SET_HOOK_REGISTRY_VEC(vec_name) \
void set_hook_registry(const std::shared_ptr<debug_utils::HookRegistry> &hook_registry, const std::string &hook_prefix = "") { \
hook_registry_ = hook_registry; \
hook_prefix_ = hook_prefix; \
for (size_t i = 0; i < vec_name##_.size(); ++i) { \
if (vec_name##_[i]) { \
std::string layer_name = "layer" + std::to_string(i); \
std::string item_prefix = BUILD_HOOK_PREFIX(hook_prefix, layer_name); \
vec_name##_[i]->set_hook_registry(hook_registry, item_prefix); \
} \
} \
}
} // namespace infinilm::models::debug_utils
#pragma once
#include "infinicore/tensor.hpp"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <numeric>
#include <vector>
#include <string>
namespace infinilm::models::debug_utils {
// Helper function to log tensor statistics and sample values
// This is useful for debugging intermediate values in model forward passes
// NOTE: This is a debug utility. Do not use in production code.
inline void log_tensor_stats(const infinicore::Tensor &tensor, const std::string &name,
bool log_samples = true, size_t max_samples = 10) {
auto shape = tensor->shape();
auto dtype = tensor->dtype();
auto device = tensor->device();
// Log basic info
std::string shape_str = "[";
for (size_t i = 0; i < shape.size(); ++i) {
if (i > 0) shape_str += ", ";
shape_str += std::to_string(shape[i]);
}
shape_str += "]";
SPDLOG_INFO(" {}: shape={}, dtype={}, device={}", name, shape_str, static_cast<int>(dtype), device.toString());
// For F32 tensors, compute and log statistics
if (dtype == infinicore::DataType::F32) {
// Copy to CPU if needed and compute stats
auto cpu_tensor = tensor->to(infinicore::Device(infinicore::Device::Type::CPU, 0));
std::byte *raw_data = cpu_tensor->data();
float *data = reinterpret_cast<float*>(raw_data);
size_t numel = cpu_tensor->numel();
if (numel > 0) {
float min_val = *std::min_element(data, data + numel);
float max_val = *std::max_element(data, data + numel);
float sum = std::accumulate(data, data + numel, 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats: min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
// Log sample values at specific positions
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, data[i]);
}
}
}
} else {
SPDLOG_INFO(" {} (Stats computation skipped for non-F32 tensor)", name);
}
}
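// Illustrative call site (hypothetical, not part of the original commit):
//   debug_utils::log_tensor_stats(hidden_states, "layer0_hidden", /*log_samples=*/true, /*max_samples=*/4);
// For an F32 tensor this prints shape/dtype/device, min/max/mean, and the first four values.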
// Helper function to log specific tensor positions (for debugging)
// NOTE: This is a debug utility. Do not use in production code.
inline void log_tensor_positions(const infinicore::Tensor &tensor, const std::string &name,
const std::vector<std::vector<size_t>> &positions) {
auto shape = tensor->shape();
auto dtype = tensor->dtype();
// Only log for F32 tensors (or copy to CPU)
if (dtype == infinicore::DataType::F32) {
auto cpu_tensor = tensor->to(infinicore::Device(infinicore::Device::Type::CPU, 0));
std::byte *raw_data = cpu_tensor->data();
float *data = reinterpret_cast<float*>(raw_data);
SPDLOG_INFO(" {}: Logging specific positions:", name);
for (const auto &pos : positions) {
if (pos.size() != shape.size()) {
SPDLOG_INFO(" Position {}: dimension mismatch (expected {} dims, got {})",
pos.size(), shape.size());
continue;
}
// Calculate linear index
size_t idx = 0;
size_t stride = 1;
bool valid = true;
for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
if (pos[i] >= shape[i]) {
valid = false;
break;
}
idx += pos[i] * stride;
stride *= shape[i];
}
if (valid && idx < cpu_tensor->numel()) {
std::string pos_str = "[";
for (size_t i = 0; i < pos.size(); ++i) {
if (i > 0) pos_str += ", ";
pos_str += std::to_string(pos[i]);
}
pos_str += "]";
SPDLOG_INFO(" Position {}: value = {:.6e}", pos_str, data[idx]);
} else {
std::string pos_str = "[";
for (size_t i = 0; i < pos.size(); ++i) {
if (i > 0) pos_str += ", ";
pos_str += std::to_string(pos[i]);
}
pos_str += "]";
SPDLOG_INFO(" Position {}: invalid (out of bounds)", pos_str);
}
}
}
}
} // namespace infinilm::models::debug_utils
#pragma once
/**
* @file llama.hpp
* @brief Main header file for Llama model architecture
*
* This header includes all components of the Llama model architecture
* built using InfiniCore::nn::Module pattern.
*
* Components:
* - LlamaConfig: Model configuration structure
* - LlamaAttention: Multi-head self-attention module
* - LlamaMLP: Feed-forward network module
* - LlamaDecoderLayer: Single transformer decoder layer
* - LlamaModel: Core transformer model (without LM head)
* - LlamaForCausalLM: Complete model with language modeling head
*/
#include "llama_config.hpp"
#include "llama_attention.hpp"
#include "llama_mlp.hpp"
#include "llama_decoder_layer.hpp"
#include "llama_model.hpp"
#include "llama_for_causal_lm.hpp"
#include "llama_attention.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/ops/mul.hpp"
#include <spdlog/spdlog.h>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <iostream>
#include <algorithm>
namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads),
head_dim_(config.head_dim),
kv_dim_(config.kv_dim()),
use_bias_(config.attention_bias) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
}
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
// Input shape: [batch, seq_len, hidden_size]
auto hidden_states_mutable = hidden_states;
auto shape = hidden_states->shape();
size_t batch_size = shape[0];
size_t seq_len = shape[1];
// 1. Project Q, K, V
auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size]
auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
// 2. Reshape for multi-head attention
// Reshape Q, K, V to include batch dimension
// Python: query_states = self.q_proj(hidden_states).view(querys_shape)
// The view operation requires the tensor to be contiguous in the required dimensions
auto q_reshaped = q->view({batch_size, seq_len, num_attention_heads_, head_dim_});
auto k_reshaped = k->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
auto v_reshaped = v->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
// 3. Prepare position_ids for RoPE - align with Python pattern
// Python: bs, num = pos_ids.shape; pos_ids = pos_ids.view((bs * num,))
auto pos_shape = position_ids->shape();
infinicore::Tensor pos_ids_for_rope = position_ids;
if (pos_shape.size() == 2) {
auto pos_narrowed = position_ids->narrow({{0, 0, 1}});
pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]});
} else if (pos_shape.size() == 1) {
pos_ids_for_rope = position_ids->contiguous();
} else {
throw std::runtime_error("Unexpected position_ids shape");
}
// 4. Apply RoPE to full batch - align with Python pattern
// Python: x = x.view((bs * seq_len, num_heads, head_dim))
// Python asserts: seq_len * x_stride[1] == x_stride[0] (contiguous in dim=0 and dim=1)
// The kernel requires stride(2) == 1 (last dimension contiguous)
// Python's assertion + stride(2) == 1 means the tensor is fully contiguous
// However, to be safe and match Python's behavior exactly, ensure fully contiguous
auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous();
auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous();
// Call RoPE on full batch (matching Python pattern)
auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope);
auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope);
// Reshape back to [batch_size, seq_len, num_heads, head_dim] (matching Python pattern)
q_rope_out = q_rope_out->view({batch_size, seq_len, num_attention_heads_, head_dim_});
k_rope_out = k_rope_out->view({batch_size, seq_len, num_key_value_heads_, head_dim_});
// 5. Process each batch item separately for attention computation
infinilm::cache::KVCache *external_cache = static_cast<infinilm::cache::KVCache *>(kv_cache);
auto output_tensor = infinicore::Tensor::empty(
{batch_size, seq_len, hidden_size_},
q->dtype(),
q->device()
);
for (size_t b = 0; b < batch_size; ++b) {
// Extract batch item from RoPE output (already computed above for full batch)
// Ensure contiguous after narrow+view to avoid stride issues in GEMM operations
auto q_batch = q_rope_out->narrow({{0, b, 1}})->view({seq_len, num_attention_heads_, head_dim_});
auto k_batch = k_rope_out->narrow({{0, b, 1}})->view({seq_len, num_key_value_heads_, head_dim_});
auto v_batch = v_reshaped->narrow({{0, b, 1}})->view({seq_len, num_key_value_heads_, head_dim_});
// Convert to [n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations
auto q_rope = q_batch->permute({1, 0, 2})->contiguous(); // [n_q_head, seq_len, head_dim]
auto k_rope = k_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]
// 6. Prepare KV caches
infinicore::Tensor k_total = infinicore::Tensor::empty({1, 1, 1}, k_rope->dtype(), k_rope->device());
infinicore::Tensor v_total = infinicore::Tensor::empty({1, 1, 1}, v_permuted->dtype(), v_permuted->device());
if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
} else {
auto [k_total_tmp, v_total_tmp] = internal_cache_.update(k_rope, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
}
// 7. Compute attention - strictly align with Python pattern
// Python: query_states_i = query_states.narrow(0, i, 1).view((seq_len, num_attention_heads, head_dim))
// Python: key_states_i = key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python: value_states_i = value_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python: attention_i = grouped_query_attention(query_states_i, key_states_i, value_states_i, scaling=self.scaling)
// Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim])
// Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
// Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute
auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
// q_batch is already [seq_len, n_q_head, head_dim] from above
auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim]
// Python: grouped_query_attention calls repeat_kv if ngroup > 1
// Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim]
size_t ngroup = num_attention_heads_ / num_key_value_heads_;
if (ngroup > 1) {
// Python: repeat_kv uses as_strided to expand
size_t total_seq_len = k_for_attn->shape()[0];
size_t n_kv_head = k_for_attn->shape()[1];
size_t head_dim = k_for_attn->shape()[2];
auto k_strides = k_for_attn->strides();
auto k_strided = k_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{k_strides[0], k_strides[1], 0, k_strides[2]}
);
k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
auto v_strides = v_for_attn->strides();
auto v_strided = v_for_attn->as_strided(
{total_seq_len, n_kv_head, ngroup, head_dim},
{v_strides[0], v_strides[1], 0, v_strides[2]}
);
v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
}
// Python: multi_head_attention(querys, keys, values, scaling)
// Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim]
// Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!)
// Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim]
auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim]
auto K = k_for_attn; // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python)
auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim]
// Python: attn_weight = Q @ K.permute((1, 2, 0))
// Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len]
auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len]
// Use GEMM with alpha=scaling to combine scaling with matrix multiplication
// This is more efficient than doing matmul followed by mul
float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len]
infinicore::op::causal_softmax_(attn_weight, attn_weight);
auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim]
// Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim]
auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim]
// Python: attn_output_i.copy_(attention_i)
// Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size]
// Copy to output tensor - attn_output is [seq_len, num_attention_heads, head_dim]
auto output_batch = output_tensor->narrow({{0, b, 1}})->view({seq_len, hidden_size_});
auto attn_flat = attn_output->contiguous()->view({seq_len, hidden_size_});
output_batch->copy_from(attn_flat);
}
// 8. Apply output projection to all batches
auto output = o_proj_->forward(output_tensor);
return output;
}
void LlamaAttention::set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
rotary_emb_ = rotary_emb;
}
} // namespace infinilm::models::llama
#pragma once
#include "llama_config.hpp"
#include "cache/kv_cache.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <algorithm>
#include <utility>
#include <memory>
namespace infinilm::models::llama {
/**
* @brief Multi-head self-attention module for Llama
*
* Implements the attention mechanism with:
* - Query, Key, Value projections
* - Output projection
* - Rotary Position Embeddings (RoPE) applied to Q and K
* - Support for Grouped Query Attention (GQA)
*/
class LlamaAttention : public infinicore::nn::Module {
public:
/**
* @brief Construct LlamaAttention module
*
* @param config Model configuration
* @param device Device to create tensors on
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
* @brief Forward pass: compute attention
*
* @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_cache Optional KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
/**
* @brief Provide shared RoPE module from parent model.
*/
void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb);
// Module information
size_t num_heads() const { return num_attention_heads_; }
size_t num_kv_heads() const { return num_key_value_heads_; }
size_t head_dim() const { return head_dim_; }
size_t hidden_size() const { return hidden_size_; }
protected:
// Projection layers
INFINICORE_NN_MODULE(infinicore::nn::Linear, q_proj);
INFINICORE_NN_MODULE(infinicore::nn::Linear, k_proj);
INFINICORE_NN_MODULE(infinicore::nn::Linear, v_proj);
INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj);
// Shared Rotary Position Embeddings (RoPE)
std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
private:
size_t hidden_size_;
size_t num_attention_heads_;
size_t num_key_value_heads_;
size_t head_dim_;
size_t kv_dim_;
bool use_bias_;
// Internal KV cache for when no external cache is provided
mutable infinilm::cache::KVCache internal_cache_;
};
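// A minimal usage sketch (not part of the original commit): the attention module must
// be given the shared RoPE instance before forward() is called, otherwise it throws.
// The helper name `example_attention_forward` is hypothetical; the module runs with
// whatever weights it was constructed with.
inline infinicore::Tensor example_attention_forward(
    const LlamaConfig &config,
    const std::shared_ptr<infinicore::nn::RoPE> &rope,
    const infinicore::Tensor &hidden_states,   // [batch, seq_len, hidden_size]
    const infinicore::Tensor &position_ids) {  // [batch, seq_len] or [seq_len]
    LlamaAttention attn(config, hidden_states->device());
    attn.set_rotary_emb(rope); // required before forward()
    return attn.forward(hidden_states, position_ids, /*kv_cache=*/nullptr);
}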
} // namespace infinilm::models::llama
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
namespace infinilm::models::llama {
/**
* @brief Configuration structure for Llama model architecture
*
* This struct holds all hyperparameters needed to construct a Llama model.
* It follows the same structure as HuggingFace's LlamaConfig.
*/
struct LlamaConfig {
// Vocabulary and embedding
size_t vocab_size = 32000; // Vocabulary size
size_t hidden_size = 4096; // Hidden dimension size
size_t intermediate_size = 11008; // MLP intermediate dimension
// Architecture
size_t num_hidden_layers = 32; // Number of decoder layers
size_t num_attention_heads = 32; // Number of attention heads
size_t num_key_value_heads = 32; // Number of key-value heads (for GQA)
size_t head_dim = 128; // Attention head dimension (hidden_size / num_attention_heads)
// Position embeddings
size_t max_position_embeddings = 2048; // Maximum sequence length
double rope_theta = 10000.0; // RoPE base frequency
// Normalization
double rms_norm_eps = 1e-6; // RMSNorm epsilon
// Activation
std::string hidden_act = "silu"; // Activation function (typically "silu")
std::string model_type = "llama"; // Model type identifier (matches HF configs)
// Optional features
bool use_cache = true; // Whether to use KV cache
bool attention_bias = false; // Whether to use bias in attention projections
bool mlp_bias = false; // Whether to use bias in MLP projections
bool tie_word_embeddings = false; // Whether to tie input/output embeddings
// Token IDs
int64_t pad_token_id = -1; // Padding token ID (optional)
int64_t bos_token_id = 1; // Beginning of sequence token ID
int64_t eos_token_id = 2; // End of sequence token ID
/**
* @brief Compute key-value dimension for Grouped Query Attention (GQA)
* @return The dimension for key/value projections
*/
size_t kv_dim() const {
return hidden_size * num_key_value_heads / num_attention_heads;
}
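// Example (hypothetical GQA configuration, not taken from any shipped config file):
// hidden_size = 2048, num_attention_heads = 32, num_key_value_heads = 4
// gives head_dim = 64 and kv_dim() = 2048 * 4 / 32 = 256, i.e. the K/V projections
// are 8x smaller than the Q projection.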
/**
* @brief Validate configuration parameters
* @return true if configuration is valid
*/
bool validate() const {
if (hidden_size % num_attention_heads != 0) {
return false;
}
if (num_attention_heads % num_key_value_heads != 0) {
return false;
}
if (head_dim != hidden_size / num_attention_heads) {
return false;
}
return true;
}
};
} // namespace infinilm::models::llama
#include "llama_decoder_layer.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/ops.hpp"
namespace infinilm::models::llama {
LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
// Initialize layer normalization layers
INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps,
dtype, device);
INFINICORE_NN_MODULE_INIT(post_attention_layernorm, config.hidden_size, config.rms_norm_eps,
dtype, device);
// Initialize attention and MLP modules
INFINICORE_NN_MODULE_INIT(self_attn, config, device, dtype);
INFINICORE_NN_MODULE_INIT(mlp, config, device, dtype);
}
infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
// Save residual for attention
auto residual = hidden_states;
// 1. Pre-attention layer normalization
auto normed_states = input_layernorm_->forward(hidden_states);
// 2. Self-attention with residual connection
auto attn_output = self_attn_->forward(normed_states, position_ids, kv_cache);
// Add residual: hidden_states = hidden_states + attn_output
auto output = infinicore::op::add(residual, attn_output);
// Save residual for MLP
residual = output;
// 3. Post-attention layer normalization
normed_states = post_attention_layernorm_->forward(output);
// 4. MLP with residual connection
auto mlp_output = mlp_->forward(normed_states);
// Add residual: output = output + mlp_output
output = infinicore::op::add(residual, mlp_output);
return output;
}
} // namespace infinilm::models::llama
#pragma once
#include "llama_config.hpp"
#include "llama_attention.hpp"
#include "llama_mlp.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
namespace infinilm::models::llama {
/**
* @brief Single decoder layer (transformer block) for Llama
*
* Each decoder layer consists of:
* - Input layer normalization (RMSNorm)
* - Self-attention mechanism
* - Post-attention layer normalization (RMSNorm)
* - MLP feed-forward network
*
* Residual connections are applied around both attention and MLP blocks.
*/
class LlamaDecoderLayer : public infinicore::nn::Module {
public:
/**
* @brief Construct LlamaDecoderLayer module
*
* @param config Model configuration
* @param device Device to create tensors on
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
* @brief Forward pass: process one decoder layer
*
* @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_cache Optional KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
if (self_attn_) {
self_attn_->set_rotary_emb(rotary_emb);
}
}
protected:
// Layer normalization
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm);
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm);
// Attention and MLP
INFINICORE_NN_MODULE(LlamaAttention, self_attn);
INFINICORE_NN_MODULE(LlamaMLP, mlp);
};
} // namespace infinilm::models::llama
#include "llama_for_causal_lm.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/ops.hpp"
namespace infinilm::models::llama {
LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
// Initialize base model
INFINICORE_NN_MODULE_INIT(model, config, device, dtype);
// Initialize language modeling head
// Note: If tie_word_embeddings is true, we would share weights with embed_tokens
// For now, we create a separate linear layer
INFINICORE_NN_MODULE_INIT(lm_head, config.hidden_size, config.vocab_size, false,
dtype, device);
}
infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
// 1. Forward through base model to get hidden states
auto hidden_states = model_->forward(input_ids, position_ids, kv_caches);
// 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states);
return logits;
}
} // namespace infinilm::models::llama
#pragma once
#include "llama_model.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
namespace infinilm::models::llama {
/**
* @brief Llama model for Causal Language Modeling
*
* Extends LlamaModel by adding a language modeling head (lm_head) that
* projects hidden states to vocabulary logits.
*
* This matches the structure of HuggingFace's LlamaForCausalLM.
*/
class LlamaForCausalLM : public infinicore::nn::Module {
public:
/**
* @brief Construct LlamaForCausalLM module
*
* @param config Model configuration
* @param device Device to create tensors on
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
* @brief Forward pass: compute language modeling logits
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
 * @return Logits tensor of shape [batch, 1, vocab_size] (logits for the last position
 *         only, since LlamaModel returns the normalized hidden state of the last token)
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
// Module information
const LlamaConfig &config() const { return model_->config(); }
LlamaModel &model() { return *model_; }
const LlamaModel &model() const { return *model_; }
protected:
// Base model
INFINICORE_NN_MODULE(LlamaModel, model);
// Language modeling head
INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head);
};
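// A minimal usage sketch (not part of the original commit): one forward step of the
// causal-LM wrapper. The helper name `example_next_token_logits` is hypothetical; with
// the current LlamaModel implementation the returned logits cover the last position only.
inline infinicore::Tensor example_next_token_logits(
    const LlamaForCausalLM &lm,
    const infinicore::Tensor &input_ids,    // [batch, seq_len]
    const infinicore::Tensor &position_ids, // [batch, seq_len] or [seq_len]
    std::vector<void *> *kv_caches = nullptr) {
    // The caller would typically argmax/sample over the vocab dimension of the result.
    return lm.forward(input_ids, position_ids, kv_caches);
}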
} // namespace infinilm::models::llama
#include "llama_mlp.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/ops.hpp"
namespace infinilm::models::llama {
LlamaMLP::LlamaMLP(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
intermediate_size_(config.intermediate_size),
use_bias_(config.mlp_bias) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, intermediate_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, intermediate_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_,
dtype, device);
}
infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const {
// 1. Project to gate and up
auto hidden_states_mutable = hidden_states;
auto gate = gate_proj_->forward(hidden_states_mutable);
auto up = up_proj_->forward(hidden_states_mutable);
// 2. Apply SwiGLU: silu(gate) * up
// Note: swiglu kernel expects (up, gate) and computes gate * sigmoid(gate) * up
// So we pass (up, gate) to get the correct result: gate * sigmoid(gate) * up
auto intermediate = infinicore::op::swiglu(up, gate);
// 3. Project down
auto output = down_proj_->forward(intermediate);
return output;
}
} // namespace infinilm::models::llama
#pragma once
#include "llama_config.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
namespace infinilm::models::llama {
/**
* @brief MLP (Feed-Forward Network) module for Llama
*
* Implements the MLP block with:
* - Gate projection
* - Up projection
* - Down projection
* - SiLU activation function
*
* Formula: down_proj(SiLU(gate_proj(x)) * up_proj(x))
*/
class LlamaMLP : public infinicore::nn::Module {
public:
/**
* @brief Construct LlamaMLP module
*
* @param config Model configuration
* @param device Device to create tensors on
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaMLP(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
* @brief Forward pass: compute MLP output
*
* @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states) const;
// Module information
size_t hidden_size() const { return hidden_size_; }
size_t intermediate_size() const { return intermediate_size_; }
protected:
// Projection layers
INFINICORE_NN_MODULE(infinicore::nn::Linear, gate_proj);
INFINICORE_NN_MODULE(infinicore::nn::Linear, up_proj);
INFINICORE_NN_MODULE(infinicore::nn::Linear, down_proj);
private:
size_t hidden_size_;
size_t intermediate_size_;
bool use_bias_;
};
} // namespace infinilm::models::llama
#include "llama_model.hpp"
#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
namespace infinilm::models::llama {
LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype)
: config_(config) {
// Initialize token embeddings
INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size,
std::nullopt, dtype, device);
// Initialize decoder layers
INFINICORE_NN_MODULE_VEC_INIT(layers, config.num_hidden_layers, LlamaDecoderLayer,
config, device, dtype);
// Initialize final layer normalization
INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps,
dtype, device);
// Initialize Rotary Position Embeddings (shared across all layers)
// Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing
INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings,
config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX,
dtype, device);
for (auto &layer : layers_) {
if (layer) {
layer->set_rotary_emb(rotary_emb_);
}
}
}
infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
// 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
auto hidden_states = embed_tokens_->forward(input_ids);
// 2. Process through all decoder layers
for (size_t i = 0; i < layers_.size(); ++i) {
void *kv_cache = (kv_caches && i < kv_caches->size()) ? (*kv_caches)[i] : nullptr;
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, kv_cache);
}
// 3. Apply final layer normalization, then keep only the last token's hidden state
//    (aligns with transformers' next-token usage)
//    [batch, seq_len, hidden_size] -> [batch, 1, hidden_size]
auto shape = hidden_states->shape();
size_t seq_len = shape[1];
auto normalized_states = norm_->forward(hidden_states);
auto normalized_last_token = normalized_states->narrow({{1, seq_len - 1, 1}});
return normalized_last_token;
}
} // namespace infinilm::models::llama
#pragma once
#include "llama_config.hpp"
#include "llama_decoder_layer.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <vector>
namespace infinilm::models::llama {
/**
* @brief Main Llama model architecture (without language modeling head)
*
* This is the core transformer model consisting of:
* - Token embeddings (embed_tokens)
* - Multiple decoder layers (layers)
* - Final layer normalization (norm)
* - Rotary Position Embeddings (rotary_emb)
*
* This matches the structure of HuggingFace's LlamaModel.
*/
class LlamaModel : public infinicore::nn::Module {
public:
/**
* @brief Construct LlamaModel module
*
* @param config Model configuration
* @param device Device to create tensors on
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaModel(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
* @brief Forward pass: process input through the model
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
 * @return Output tensor of shape [batch, 1, hidden_size] (the final RMSNorm is applied
 *         and only the last token's hidden state is returned)
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
// Module information
const LlamaConfig &config() const { return config_; }
size_t num_layers() const { return config_.num_hidden_layers; }
protected:
// Token embeddings
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
// Decoder layers
INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers);
// Final normalization
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm);
// Rotary Position Embeddings (shared across all layers)
INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb);
private:
LlamaConfig config_;
};
} // namespace infinilm::models::llama
#include <pybind11/pybind11.h>
#include "models/llama.hpp"
namespace py = pybind11;
PYBIND11_MODULE(_infinilm_llama, m) {
m.doc() = "InfiniLM Llama model Python bindings";
infinilm::models::llama::bind_llama(m);
}