// llama_model.cpp — LlamaModel implementation.
#include "llama_model.hpp"

#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"

#include <iostream>
#include <stdexcept>

namespace infinilm::models::llama {
/**
 * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
 *
 * ⚠️ DEVELOPMENT POLICY:
 *   - NO new development or feature additions permitted on this interface
 *   - Only critical bug fixes (security/stability) allowed until removal
 *   - All new code MUST migrate to the polymorphic overload below
 *
 * Replacement: Use the polymorphic overload of this same function name with updated signature
 * Reason: Legacy signature lacks support for dynamic quantization modes.
 * Removal target: v0.2.0 (Q2 2026)
 */
Your Name's avatar
Your Name committed
21
22
LlamaModel::LlamaModel(const LlamaConfig &config,
                       const infinicore::Device &device,
23
24
                       engine::distributed::RankInfo rank_info,
                       backends::AttentionBackend attention_backend)
PanZezhong's avatar
PanZezhong committed
25
    : config_(config), rank_info_(rank_info) {
26
    const auto &dtype{config.dtype};
27
28
29
30
    // Initialize token embeddings
    INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size,
                              std::nullopt, dtype, device);

Ceng's avatar
Ceng committed
31
32
33
34
35
36
37
    // Initialize decoder layers with layer indices
    // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments
    //       (e.g., via a factory function or lambda that receives the layer index)
    //       Currently, we can't use the macro because each layer needs a different layer_idx
    layers_.reserve(config.num_hidden_layers);
    for (size_t i = 0; i < config.num_hidden_layers; ++i) {
        layers_.push_back(this->register_module<LlamaDecoderLayer>(
38
            "layers." + std::to_string(i), config, device, i, rank_info, attention_backend));
Ceng's avatar
Ceng committed
39
    }
40
41
42
43
44
45
46
47
48

    // Initialize final layer normalization
    INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps,
                              dtype, device);

    // Initialize Rotary Position Embeddings (shared across all layers)
    // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing
    INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings,
                              config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX,
PanZezhong's avatar
PanZezhong committed
49
                              dtype, device, config.rope_scaling);
50
51
52
53
54
55
56
57

    for (auto &layer : layers_) {
        if (layer) {
            layer->set_rotary_emb(rotary_emb_);
        }
    }
}
/**
 * @brief Construct a LlamaModel from a polymorphic ModelConfig.
 *
 * Preferred replacement for the deprecated LlamaConfig-based constructor:
 * every hyperparameter is read from the ModelConfig accessor interface.
 *
 * @param model_config       Shared model configuration (kept alive in model_config_).
 * @param device             Device all submodules are created on.
 * @param rank_info          Distributed-rank information forwarded to each decoder layer.
 * @param attention_backend  Attention implementation selector forwarded to each decoder layer.
 */
LlamaModel::LlamaModel(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                       const infinicore::Device &device,
                       engine::distributed::RankInfo rank_info,
                       backends::AttentionBackend attention_backend)
    : model_config_(model_config), rank_info_(rank_info) {
    const auto &dtype{model_config_->get_dtype()};

    // Initialize token embeddings
    INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get<size_t>("vocab_size"), model_config_->get<size_t>("hidden_size"),
                              std::nullopt, dtype, device);

    // Initialize decoder layers with layer indices
    // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments
    //       (e.g., via a factory function or lambda that receives the layer index)
    //       Currently, we can't use the macro because each layer needs a different layer_idx
    // Hoisted: avoid re-querying the config on every loop iteration.
    const size_t num_hidden_layers = model_config_->get<size_t>("num_hidden_layers");
    layers_.reserve(num_hidden_layers);
    for (size_t i = 0; i < num_hidden_layers; ++i) {
        layers_.push_back(this->register_module<LlamaDecoderLayer>(
            "layers." + std::to_string(i), model_config_, device, i, rank_info, attention_backend));
    }

    // Initialize final layer normalization
    INFINICORE_NN_MODULE_INIT(norm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
                              dtype, device);

    // Initialize Rotary Position Embeddings (shared across all layers)
    // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing
    INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config_->get<size_t>("max_position_embeddings"),
                              model_config_->get<double>("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX,
                              dtype, device, model_config_->get_rope_scaling());

    // Hand the single shared RoPE module to every decoder layer.
    for (auto &layer : layers_) {
        if (layer) {
            layer->set_rotary_emb(rotary_emb_);
        }
    }
}
/**
 * @brief Full model forward pass: token embedding -> decoder stack -> final norm.
 *
 * @param input_ids               Token ids to embed.
 * @param position_ids            Positions used by the layers' rotary embedding.
 * @param past_sequence_lengths   Optional per-sequence cached lengths (passed through to layers).
 * @param total_sequence_lengths  Optional per-sequence total lengths (passed through to layers).
 * @param input_offsets           Optional per-sequence input offsets (passed through to layers).
 * @param cu_seqlens              Optional cumulative sequence lengths (passed through to layers).
 * @param block_tables            Optional paged-KV block tables (passed through to layers).
 * @param slot_mapping            Optional paged-KV slot mapping (passed through to layers).
 * @return Final hidden states after the last RMSNorm.
 */
infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
                                       const infinicore::Tensor &position_ids,
                                       std::optional<infinicore::Tensor> past_sequence_lengths,
                                       std::optional<infinicore::Tensor> total_sequence_lengths,
                                       std::optional<infinicore::Tensor> input_offsets,
                                       std::optional<infinicore::Tensor> cu_seqlens,
                                       std::optional<infinicore::Tensor> block_tables,
                                       std::optional<infinicore::Tensor> slot_mapping) const {
    // 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
    auto hidden_states = embed_tokens_->forward(input_ids);

    // 2. Process through all decoder layers.
    //    NOTE(review): `residual` is threaded through every layer call — presumably each
    //    layer updates hidden_states/residual in place; confirm against LlamaDecoderLayer::forward.
    size_t num_layers = layers_.size();
    infinicore::Tensor residual;
    for (size_t i = 0; i < num_layers; ++i) {
        layers_.at(i)->forward(
            hidden_states,
            residual,
            position_ids,
            kv_cache_,
            past_sequence_lengths,
            total_sequence_lengths,
            input_offsets,
            cu_seqlens,
            block_tables,
            slot_mapping);
    }

    // 3. Final layer norm, applied in place together with the pending residual.
    norm_->forward_inplace(hidden_states, residual);

    return hidden_states;
}
/**
 * @brief (Re)create the KV cache from the given cache configuration.
 *
 * Dispatches on both the dynamic config type (static vs. paged KV cache) and on
 * which constructor built this model: when model_config_ is null, hyperparameters
 * come from the legacy LlamaConfig (config_); otherwise from the ModelConfig.
 *
 * @param cache_config  Desired cache settings; nullptr clears the current cache.
 * @throws std::runtime_error if the config's dynamic type is not recognized.
 */
void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) {
    if (cache_config == nullptr) {
        // Explicit request to drop the cache.
        kv_cache_ = nullptr;
        return;
    }
    if (auto kv_cache_config = dynamic_cast<const cache::StaticKVCacheConfig *>(cache_config);
        kv_cache_config && model_config_ == nullptr) {
        // Legacy path: static cache sized from the deprecated LlamaConfig.
        // NOTE(review): head_dim / num_key_value_heads are passed twice — presumably
        // once for K and once for V; confirm against the StaticKVCache constructor.
        kv_cache_ = std::make_shared<cache::StaticKVCache>(
            config_.head_dim,
            config_.head_dim,
            config_.num_key_value_heads,
            config_.num_key_value_heads,
            config_.num_hidden_layers,
            config_.max_position_embeddings,
            config_.dtype,
            *kv_cache_config,
            rank_info_);
    } else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config);
               paged_kv_cache_config && model_config_ == nullptr) {
        // Legacy path: paged cache sized from the deprecated LlamaConfig.
        kv_cache_ = std::make_shared<cache::PagedKVCache>(
            config_.head_dim,
            config_.head_dim,
            config_.num_key_value_heads,
            config_.num_key_value_heads,
            config_.num_hidden_layers,
            config_.dtype,
            *paged_kv_cache_config,
            rank_info_);
    } else if (auto kv_cache_config = dynamic_cast<const cache::StaticKVCacheConfig *>(cache_config)) {
        // ModelConfig path: static cache.
        kv_cache_ = std::make_shared<cache::StaticKVCache>(
            model_config_->get_head_dim(),
            model_config_->get_head_dim(),
            model_config_->get<size_t>("num_key_value_heads"),
            model_config_->get<size_t>("num_key_value_heads"),
            model_config_->get<size_t>("num_hidden_layers"),
            model_config_->get<size_t>("max_position_embeddings"),
            model_config_->get_dtype(),
            *kv_cache_config,
            rank_info_);
    } else if (auto paged_kv_cache_config = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
        // ModelConfig path: paged cache.
        kv_cache_ = std::make_shared<cache::PagedKVCache>(
            model_config_->get_head_dim(),
            model_config_->get_head_dim(),
            model_config_->get<size_t>("num_key_value_heads"),
            model_config_->get<size_t>("num_key_value_heads"),
            model_config_->get<size_t>("num_hidden_layers"),
            model_config_->get_dtype(),
            *paged_kv_cache_config,
            rank_info_);
    } else {
        throw std::runtime_error("Unsupported cache type");
    }
}

} // namespace infinilm::models::llama