#include "llama_decoder_layer.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/ops.hpp"
#include <optional>

namespace infinilm::models::llama {
/**
 * @deprecated This constructor is deprecated and will be REMOVED in the next major release (v0.2.0).
 *
 * ⚠️ DEVELOPMENT POLICY:
 *   - NO new development or feature additions permitted on this interface
 *   - Only critical bug fixes (security/stability) allowed until removal
 *   - All new code MUST migrate to the ModelConfig-based overload below
 *
 * Replacement: the overload taking std::shared_ptr<infinilm::config::ModelConfig>
 *              (see the migration sketch following this constructor).
 * Reason: the legacy LlamaConfig signature lacks support for dynamic quantization modes.
 * Removal target: v0.2.0 (Q2 2026)
 */
LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config,
                                     const infinicore::Device &device,
                                     size_t layer_idx,
                                     engine::distributed::RankInfo rank_info,
                                     backends::AttentionBackend attention_backend) : layer_idx_(layer_idx), rank_info_(rank_info) {
    const auto &dtype{config.dtype};

    // Initialize layer normalization layers
    INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps,
                              dtype, device);
    INFINICORE_NN_MODULE_INIT(post_attention_layernorm, config.hidden_size, config.rms_norm_eps,
                              dtype, device);

    // Initialize attention and MLP modules
    INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_, attention_backend);
    INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_);
}
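
// Migration sketch for the deprecation notice above. Illustrative only, not compiled
// as part of this file: the ModelConfig construction arguments and the local variables
// (device, layer_idx, rank_info, attention_backend) are assumed to exist at the call
// site; the two constructor signatures themselves are the ones defined in this file.
//
//   // deprecated overload (removed in v0.2.0):
//   LlamaDecoderLayer layer(llama_config, device, layer_idx, rank_info, attention_backend);
//
//   // preferred overload:
//   auto model_config = std::make_shared<infinilm::config::ModelConfig>(/* ... */);
//   LlamaDecoderLayer layer(model_config, device, layer_idx, rank_info, attention_backend);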

LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                                     const infinicore::Device &device,
                                     size_t layer_idx,
                                     engine::distributed::RankInfo rank_info,
                                     backends::AttentionBackend attention_backend) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) {
    const auto &dtype{model_config_->get_dtype()};
    // Initialize layer normalization layers
    INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
                              dtype, device);
    INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
                              dtype, device);

    // Initialize attention and MLP modules
    INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_, attention_backend);
    INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_);
}

std::tuple<infinicore::Tensor, infinicore::Tensor>
LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
                           infinicore::Tensor &residual,
                           const infinicore::Tensor &position_ids,
                           std::shared_ptr<infinilm::cache::Cache> kv_cache,
                           std::optional<infinicore::Tensor> past_sequence_lengths,
                           std::optional<infinicore::Tensor> total_sequence_lengths,
                           std::optional<infinicore::Tensor> input_offsets,
                           std::optional<infinicore::Tensor> cu_seqlens,
                           std::optional<infinicore::Tensor> block_tables,
                           std::optional<infinicore::Tensor> slot_mapping) const {
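    // Assumed dataflow (forward_inplace semantics are not defined in this file):
    // the (hidden_states, residual) pair is threaded through the layer, and each
    // fused norm call is expected to add the incoming hidden_states into residual
    // and overwrite hidden_states with the normalized result, so the attention and
    // MLP outputs become the next residual contribution.
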
    // 1. Attention layer normalization
    input_layernorm_->forward_inplace(hidden_states, residual);

    // 2. Self-attention
    hidden_states = self_attn_->forward(
        hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);

    // 3. Post-attention layer normalization
    post_attention_layernorm_->forward_inplace(hidden_states, residual);

    // 4. MLP
    hidden_states = mlp_->forward(hidden_states);

    return std::make_tuple(hidden_states, residual);
}
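
// Usage sketch (illustrative assumption; the surrounding model loop is not part of
// this file): a model typically threads the (hidden_states, residual) pair through
// its stack of decoder layers, e.g.
//
//   for (auto &layer : layers) {
//       std::tie(hidden_states, residual) = layer.forward(
//           hidden_states, residual, position_ids, kv_cache,
//           past_sequence_lengths, total_sequence_lengths, input_offsets,
//           cu_seqlens, block_tables, slot_mapping);
//   }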

} // namespace infinilm::models::llama