// llama_model.hpp
#pragma once

#include "../../cache/kv_cache.hpp"
#include "llama_decoder_layer.hpp"

#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
#include "llama_config.hpp"

#include <cstddef>
#include <memory>
#include <optional>
#include <vector>

#include "../../engine/distributed/distributed.hpp"

namespace infinilm::models::llama {

/**
 * @brief Main Llama model architecture (without language modeling head)
 *
 * This is the core transformer model consisting of:
 * - Token embeddings (embed_tokens)
 * - Multiple decoder layers (layers)
 * - Final layer normalization (norm)
 * - Rotary Position Embeddings (rotary_emb)
 *
 * This matches the structure of HuggingFace's LlamaModel.
 */
class LlamaModel : public infinicore::nn::Module {
public:
    /**
     * @brief Construct LlamaModel module
     *
     * @param config Model configuration
     * @param device Device to create tensors on
     * @param dtype Optional data type for model parameters (defaults to F32)
     */
40
41
42
43
44
45
46
47
48
49
50
51
    /**
     * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
     *
     * ⚠️ DEVELOPMENT POLICY:
     *   - NO new development or feature additions permitted on this interface
     *   - Only critical bug fixes (security/stability) allowed until removal
     *   - All new code MUST migrate to the polymorphic overload below
     *
     * Replacement: Use the polymorphic overload of this same function name with updated signature
     * Reason: Legacy signature lacks support for dynamic quantization modes.
     * Removal target: v0.2.0 (Q2 2026)
     */
Your Name's avatar
Your Name committed
52
53
    LlamaModel(const LlamaConfig &config,
               const infinicore::Device &device,
54
55
               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
               backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
56

57
58
    LlamaModel(std::shared_ptr<infinilm::config::ModelConfig> model_config,
               const infinicore::Device &device,
59
60
               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
               backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);
61

62
63
64
    /**
     * @brief Forward pass: process input through the model
     *
65
66
     * @param input_ids Token IDs tensor of shape [batch, seq_len]. Batch is 1 when continuous batch is used,
     *                 and tokens from all requests are concatenated along seq_len dimension.
67
     * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
68
69
     * @param past_sequence_lengths Cache positions tensor of shape [n_req]
     * @param total_sequence_lengths Total sequence lengths tensor of shape [n_req]
PanZezhong's avatar
PanZezhong committed
70
     * @param input_offsets Input offsets (starting position) of each request in a continuous batch of shape [n_req + 1]
71
72
73
     * @return Output tensor of shape [batch, seq_len, hidden_size]
     */
    infinicore::Tensor forward(const infinicore::Tensor &input_ids,
Your Name's avatar
Your Name committed
74
                               const infinicore::Tensor &position_ids,
75
76
                               std::optional<infinicore::Tensor> past_sequence_lengths,
                               std::optional<infinicore::Tensor> total_sequence_lengths,
77
                               std::optional<infinicore::Tensor> input_offsets,
78
                               std::optional<infinicore::Tensor> cu_seqlens,
79
80
                               std::optional<infinicore::Tensor> block_tables,
                               std::optional<infinicore::Tensor> slot_mapping) const;
PanZezhong's avatar
PanZezhong committed
81
82

    void reset_cache(const cache::CacheConfig *cache_config);
83
84

    // Module information
85
    size_t num_layers() const { return model_config_->get<size_t>("num_hidden_layers"); }
86
87
88
89
90
91
92
93
94
95
96
97
98
99

protected:
    // Token embeddings
    INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);

    // Decoder layers
    INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers);

    // Final normalization
    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm);

    // Rotary Position Embeddings (shared across all layers)
    INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb);

PanZezhong's avatar
PanZezhong committed
100
101
102
103
    engine::distributed::RankInfo rank_info_;

    std::shared_ptr<cache::Cache> kv_cache_;

104
105
private:
    LlamaConfig config_;
106
107

    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
108
109
110
};

} // namespace infinilm::models::llama