#pragma once

#include "../../cache/kv_cache.hpp"
#include "llama_decoder_layer.hpp"

#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
#include "llama_config.hpp"
// NOTE(review): "llama_decoder_layer.hpp" is already included above; this second
// include is redundant.
#include "llama_decoder_layer.hpp"
// NOTE(review): the next two directives carry no header name in this copy of the
// file. The same applies to the template arguments of std::shared_ptr,
// std::optional and model_config_->get further down. Restore them from the
// original source before building.
#include
#include

#include "../../engine/distributed/distributed.hpp"

namespace infinilm::models::llama {

/**
 * @brief Main Llama model architecture (without language modeling head)
 *
 * This is the core transformer model. Its sub-modules, in declaration order, are:
 * - Token embeddings (embed_tokens)
 * - Multiple decoder layers (layers)
 * - Final layer normalization (norm)
 * - Rotary Position Embeddings (rotary_emb)
 *
 * This matches the structure of HuggingFace's LlamaModel.
 */
class LlamaModel : public infinicore::nn::Module {
public:
    /**
     * @brief Construct LlamaModel module from a LlamaConfig (legacy signature)
     *
     * @param config Model configuration
     * @param device Device to create tensors on
     * @param rank_info Rank information; defaults to a default-constructed
     *        engine::distributed::RankInfo
     * @param attention_backend Attention backend selection; defaults to
     *        backends::AttentionBackend::Default
     *
     * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
     *
     * ⚠️ DEVELOPMENT POLICY:
     * - NO new development or feature additions permitted on this interface
     * - Only critical bug fixes (security/stability) allowed until removal
     * - All new code MUST migrate to the polymorphic overload below
     *
     * Replacement: Use the polymorphic overload of this same function name with updated signature
     * Reason: Legacy signature lacks support for dynamic quantization modes.
     * Removal target: v0.2.0 (Q2 2026)
     */
    LlamaModel(const LlamaConfig &config,
               const infinicore::Device &device,
               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
               backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

    /**
     * @brief Construct LlamaModel module from a shared model configuration
     *
     * This is the overload the deprecation notice on the legacy constructor
     * points to as its replacement.
     *
     * @param model_config Shared pointer to the model configuration
     * @param device Device to create tensors on
     * @param rank_info Rank information; defaults to a default-constructed
     *        engine::distributed::RankInfo
     * @param attention_backend Attention backend selection; defaults to
     *        backends::AttentionBackend::Default
     */
    LlamaModel(std::shared_ptr model_config,
               const infinicore::Device &device,
               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(),
               backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

    /**
     * @brief Forward pass: process input through the model
     *
     * @param input_ids Token IDs tensor of shape [batch, seq_len]. Batch is 1 when continuous batch is used,
     *        and tokens from all requests are concatenated along seq_len dimension.
     * @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
     * @param past_sequence_lengths Cache positions tensor of shape [n_req] (optional)
     * @param total_sequence_lengths Total sequence lengths tensor of shape [n_req] (optional)
     * @param input_offsets Input offsets (starting position) of each request in a continuous batch
     *        of shape [n_req + 1] (optional)
     * @param cu_seqlens Optional. NOTE(review): presumably cumulative sequence lengths
     *        for variable-length attention. TODO confirm shape and meaning.
     * @param block_tables Optional. NOTE(review): presumably block tables of a paged
     *        KV cache. TODO confirm shape and meaning.
     * @param slot_mapping Optional. NOTE(review): presumably a token-to-cache-slot
     *        mapping. TODO confirm shape and meaning.
     * @return Output tensor of shape [batch, seq_len, hidden_size]
     */
    infinicore::Tensor forward(const infinicore::Tensor &input_ids,
                               const infinicore::Tensor &position_ids,
                               std::optional past_sequence_lengths,
                               std::optional total_sequence_lengths,
                               std::optional input_offsets,
                               std::optional cu_seqlens,
                               std::optional block_tables,
                               std::optional slot_mapping) const;

    /**
     * @brief Reset the cache according to the given cache configuration
     *
     * Declared here, defined elsewhere.
     * NOTE(review): presumably (re)creates kv_cache_ from cache_config; whether a
     * null cache_config is accepted is not visible from this header. Confirm in
     * the implementation.
     *
     * @param cache_config Pointer to the cache configuration to apply
     */
    void reset_cache(const cache::CacheConfig *cache_config);

    // Module information

    /**
     * @brief Number of decoder layers
     *
     * @return The value stored under "num_hidden_layers" in model_config_
     *
     * NOTE(review): model_config_ is dereferenced unconditionally. Confirm that
     * the legacy LlamaConfig constructor also populates it; otherwise this call
     * dereferences a null pointer on that path.
     */
    size_t num_layers() const { return model_config_->get("num_hidden_layers"); }

protected:
    // Token embeddings
    INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);

    // Decoder layers
    INFINICORE_NN_MODULE_VEC(LlamaDecoderLayer, layers);

    // Final normalization
    INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm);

    // Rotary Position Embeddings (shared across all layers)
    INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb);

    // Rank information for this model instance.
    // NOTE(review): presumably a copy of the constructor's rank_info argument; confirm.
    engine::distributed::RankInfo rank_info_;

    // Shared handle to the KV cache.
    // NOTE(review): presumably set up by reset_cache(); confirm.
    std::shared_ptr kv_cache_;

private:
    // Legacy configuration object.
    // NOTE(review): presumably filled by the deprecated LlamaConfig constructor; confirm.
    LlamaConfig config_;

    // Shared model configuration; read by num_layers().
    std::shared_ptr model_config_;
};

} // namespace infinilm::models::llama