Unverified commit 71c70586 authored by qinyiqun, committed by GitHub

demo131 - multiple issues regarding quantization, qy, etc.



* issue/204 - support graph in server scripts

* issue/208 - adapt to ali ppu

* issue/194 - add quantization modify configs accordingly

Support NVIDIA W8 with 1 batch and 1 TP

Add JSON support

InfiniLM: add quantization layers and a global config

Add quant config support in a reasonably elegant way

Restructure parts of the code and remove unused code

Follow InfiniCore changes

Remove all model_config usages and switch to global_config everywhere

Follow the latest InfiniLM code changes

Adjust function parameter order

Rename global config to model config

Refactor: add new API alongside legacy interfaces with deprecation warnings

Add W4 InfiniCore-related content and move the quantization config into InfiniCore

* issue/175 - qy device support

qy_page_131: add qy device

inference_server.py runs successfully on qy

* Issue/170 - Add HYGON support and improve device type handling.

* Issue/193: feats for deployment
Signed-off-by: Ceng23333 <441651826@qq.com>

* skip responding eos token
Signed-off-by: Ceng23333 <441651826@qq.com>

* issue/143 use add_rmsnorm, nt flash attn, nt kv caching

* issue/204 - support graph in server scripts

* issue/208 - adapt to ali ppu

* rebase main

* issue/216 feat: support static kv cache in server

* fix llm server cache config

* demo131 - resolve mishandled conflicts

* demo131 - further adjust attn and caching logic

* demo131 - resolve merge requirements

---------
Signed-off-by: Ceng23333 <441651826@qq.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: gongchensu <zhuyue_134@qq.com>
Co-authored-by: Ceng23333 <441651826@qq.com>
Co-authored-by: PanZezhong <panzezhong@qiyuanlab.com>
Co-authored-by: MaYuhang <2902139028@qq.com>
parent ee59b3f5
[submodule "third_party/spdlog"] [submodule "third_party/spdlog"]
path = third_party/spdlog path = third_party/spdlog
url = https://github.com/gabime/spdlog.git url = https://github.com/gabime/spdlog.git
[submodule "third_party/json"]
path = third_party/json
url = https://github.com/nlohmann/json.git
...@@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- Single inference test
- llama example
```bash
-python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
+python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
```
- For example:
```bash
......
...@@ -95,8 +95,7 @@ StaticKVCache::update(size_t layer_idx,
    if (device.getType() == infinicore::Device::Type::NVIDIA
        || device.getType() == infinicore::Device::Type::ILUVATAR
-       || device.getType() == infinicore::Device::Type::METAX
-       || device.getType() == infinicore::Device::Type::CAMBRICON) {
+       || device.getType() == infinicore::Device::Type::METAX) {
        infinicore::op::kv_caching_(
            k_cache_layer,
            v_cache_layer,
......
#include "model_config.hpp"
namespace infinilm::config {
ModelConfig::ModelConfig(const std::string &path) {
std::ifstream file(path);
if (file.is_open()) {
file >> config_json;
file.close();
} else {
throw std::runtime_error("Could not open config file: " + path);
}
this->quant_config = QuantConfig(config_json["quantization_config"]);
}
infinicore::quantization::QuantScheme
ModelConfig::get_quant_scheme() const {
    // Delegates to QuantConfig, which reports NONE when no quantization is configured.
    return quant_config.get_quant_scheme();
}
std::shared_ptr<infinicore::nn::RoPE::ScalingConfig>
ModelConfig::get_rope_scaling() const {
if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) {
return nullptr;
}
const auto &rope_scaling = config_json["rope_scaling"];
if (!rope_scaling.is_object()) {
throw std::runtime_error("rope_scaling must be an object");
}
if (!rope_scaling.contains("type")) {
throw std::runtime_error("rope_scaling must contain 'type' field");
}
std::string type_str = rope_scaling["type"].get<std::string>();
if (type_str == "longrope") {
// Required fields for LongRopeConfig
if (!rope_scaling.contains("short_factor") || !rope_scaling.contains("long_factor") || !rope_scaling.contains("original_max_position_embeddings")) {
throw std::runtime_error(
"LongRopeConfig requires 'short_factor', 'long_factor', and 'original_max_position_embeddings'");
}
auto short_factor = rope_scaling["short_factor"].get<std::vector<float>>();
auto long_factor = rope_scaling["long_factor"].get<std::vector<float>>();
size_t original_max_position_embeddings = rope_scaling["original_max_position_embeddings"].get<size_t>();
float factor = 1.0f;
if (rope_scaling.contains("factor")) {
factor = rope_scaling["factor"].get<float>();
}
return std::make_shared<infinicore::nn::RoPE::LongRopeConfig>(
std::move(short_factor),
std::move(long_factor),
original_max_position_embeddings,
factor);
} else if (type_str == "default" || type_str == "none") {
// Default scaling, no scaling applied
return nullptr;
} else {
throw std::runtime_error("Unsupported rope_scaling type: " + type_str);
}
}
infinicore::DataType
ModelConfig::get_dtype() const {
try {
std::string dtype_str = this->get<std::string>("torch_dtype");
if (dtype_str == "float32") {
return infinicore::DataType::F32;
} else if (dtype_str == "float16") {
return infinicore::DataType::F16;
} else if (dtype_str == "bfloat16") {
return infinicore::DataType::BF16;
} else if (dtype_str == "int8") {
return infinicore::DataType::I8;
} else {
throw std::runtime_error("Unsupported dtype string: " + dtype_str);
}
} catch (const std::exception &e) {
throw std::runtime_error("Error getting dtype from config: " + std::string(e.what()));
}
}
} // namespace infinilm::config
#pragma once
#include "infinicore/nn/rope.hpp"
#include "infinicore/ops.hpp"
#include "quant_config.hpp"
#include <fstream>
#include <string>
namespace infinilm::config {
class ModelConfig {
// Model config is implemented using nlohmann/json and is primarily used for advanced configuration
// beyond the standard model config. It is initialized via ModelConfig(const std::string& path)
// and passed through the InferEngine during inference.
public:
ModelConfig() = default;
// Not Implemented
// ModelConfig(const nlohmann::json &json) : config_json(json) {};
ModelConfig(const std::string &path);
// Template Function to get a value by key with type safety
template <typename T>
T get(const std::string &key) const {
if (!config_json.contains(key)) {
throw std::out_of_range("Key '" + key + "' not found in config.");
}
try {
return config_json.at(key).get<T>();
} catch (const nlohmann::json::type_error &e) {
throw std::runtime_error("Type conversion failed for key '" + key + "': " + std::string(e.what()));
}
}
template <typename T>
T get_or(const std::string &key, const T &default_value) const {
if (!config_json.contains(key) || config_json.at(key).is_null()) {
return default_value;
}
try {
return config_json.at(key).get<T>();
} catch (const nlohmann::json::type_error &) {
// If type conversion fails, return default value
return default_value;
}
}
size_t get_kv_dim() const {
return get<size_t>("hidden_size") * get<size_t>("num_key_value_heads") / get<size_t>("num_attention_heads");
}
size_t get_head_dim() const {
if (config_json.contains("head_dim")) {
return get<size_t>("head_dim");
}
return get<size_t>("hidden_size") / get<size_t>("num_attention_heads");
}
QuantConfig get_quant_config() const {
return quant_config;
}
std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const {
return quant_config.get_quantization_method();
}
infinicore::DataType get_dtype() const;
infinicore::quantization::QuantScheme get_quant_scheme() const;
std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> get_rope_scaling() const;
private:
nlohmann::json config_json;
QuantConfig quant_config;
};
} // namespace infinilm::config
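As an orientation aid (not part of the change set), a minimal usage sketch of the ModelConfig class declared above; the model path is hypothetical and the queried keys follow the accessors shown in this header.

```cpp
#include "model_config.hpp"

#include <iostream>

int main() {
    // Hypothetical model directory; ModelConfig expects a HuggingFace-style config.json
    // and throws std::runtime_error if the file cannot be opened.
    infinilm::config::ModelConfig cfg("/path/to/model_dir/config.json");

    // Typed access: get() throws if the key is missing, get_or() falls back to a default.
    auto hidden_size = cfg.get<size_t>("hidden_size");
    auto num_layers = cfg.get_or<size_t>("num_hidden_layers", 0);

    // Derived helpers and the quantization accessor defined in this header.
    bool quantized = cfg.get_quant_scheme() != infinicore::quantization::QuantScheme::NONE;
    std::cout << "hidden_size=" << hidden_size
              << " layers=" << num_layers
              << " head_dim=" << cfg.get_head_dim()
              << " kv_dim=" << cfg.get_kv_dim()
              << " quantized=" << quantized << std::endl;
    return 0;
}
```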
#include "quant_config.hpp"
namespace infinilm::config {
QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) {
this->quantization_method = get_quantization_method();
}
std::shared_ptr<infinicore::quantization::BaseQuantization>
QuantConfig::get_quantization_method() const {
    if (quantization_config.is_null()) {
        // No quantization section in config.json: fall back to the no-op quantization.
        return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config);
    }
    // Determine the quantization scheme from the JSON config; add other schemes as needed.
    if (quantization_config["quant_method"] == "compressed-tensors") {
        return std::make_shared<infinicore::quantization::CompressedTensors>(quantization_config);
    } else if (quantization_config["quant_method"] == "awq") {
        return std::make_shared<infinicore::quantization::AWQ>(quantization_config);
    } else {
        return std::make_shared<infinicore::quantization::NoneQuantization>(quantization_config);
    }
}
} // namespace infinilm::config
#pragma once
// #include "../quantization/quantization.hpp"
#include "infinicore/quantization.hpp"
#include "nlohmann/json.hpp"
namespace infinilm::config {
class QuantConfig {
// QuantConfig is used to store and parse the "quantization" field from config.json.
// This is currently a basic version and will be extended in the future.
public:
QuantConfig() = default;
QuantConfig(const nlohmann::json &json);
std::shared_ptr<infinicore::quantization::BaseQuantization> get_quantization_method() const;
infinicore::quantization::QuantScheme get_quant_scheme() const {
if (quantization_method != nullptr) {
return quantization_method->get_quant_scheme();
} else {
return infinicore::quantization::QuantScheme::NONE;
}
}
private:
nlohmann::json quantization_config;
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization_method;
};
} // namespace infinilm::config
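Similarly, a hedged sketch of QuantConfig on its own: it is handed the JSON object stored under "quantization_config" and dispatches on the "quant_method" field (see quant_config.cpp above). The extra fields shown here are illustrative, not required by this class.

```cpp
#include "quant_config.hpp"

int main() {
    // Illustrative payload; only "quant_method" is inspected by QuantConfig itself,
    // the remaining fields are passed through to the BaseQuantization implementation.
    nlohmann::json quant_json = nlohmann::json::parse(R"({
        "quant_method": "awq",
        "bits": 4,
        "group_size": 128
    })");

    infinilm::config::QuantConfig qc(quant_json);

    // Resolves to the scheme reported by the AWQ quantization object; a null JSON or
    // unknown method falls back to NoneQuantization / QuantScheme::NONE.
    auto scheme = qc.get_quant_scheme();
    auto method = qc.get_quantization_method();
    (void)scheme;
    (void)method;
    return 0;
}
```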
...@@ -6,6 +6,18 @@ namespace infinilm::engine {
//------------------------------------------------------
// Constructor
//------------------------------------------------------
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
InferEngine::InferEngine(
    const InfinilmModel::Config &config,
    const distributed::DistConfig &distributed_config,
...@@ -13,11 +25,40 @@ InferEngine::InferEngine(
    const cache::CacheConfig *cache_config,
    bool enable_graph_compiling) // Changed parameter
    : communication_group_(distributed_config, device_type),
-     model_config_(config) {
+     legacy_model_config_(config) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}
// Create one RankWorker per rank
int world_size = communication_group_.get_world_size();
barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
workers_.reserve(world_size);
for (int r = 0; r < world_size; ++r) {
workers_.emplace_back(std::make_unique<RankWorker>(
legacy_model_config_,
communication_group_.get_rank_info(r),
cache_config_ != nullptr ? cache_config_.get() : nullptr,
barrier_.get(),
enable_graph_compiling));
}
// Compile the model on all workers
this->compile();
}
InferEngine::InferEngine(
const std::string &model_path,
const distributed::DistConfig &distributed_config,
infinicore::Device::Type device_type,
const cache::CacheConfig *cache_config,
bool enable_graph_compiling) // Changed parameter
: communication_group_(distributed_config, device_type) {
    if (cache_config != nullptr) {
        cache_config_ = cache_config->unique_copy();
    }
// Load model config if model_path is provided, model_path must be valid, and config.json exists
this->model_config_ = std::make_shared<infinilm::config::ModelConfig>(model_path + "/config.json");
    // Create one RankWorker per rank
    int world_size = communication_group_.get_world_size();
    barrier_ = std::make_unique<RankBarrier>((size_t)world_size);
...@@ -30,7 +71,6 @@ InferEngine::InferEngine(
            barrier_.get(),
            enable_graph_compiling));
    }
    // Compile the model on all workers
    this->compile();
}
......
#pragma once
#include "../config/model_config.hpp"
#include "../models/infinilm_model.hpp"
#include "../models/llama/llama_config.hpp"
#include "distributed/distributed.hpp"
...@@ -19,6 +20,18 @@ public:
    using Output = RankWorker::Output;
    // Updated constructor: accept CacheConfig instead of CacheType
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
    InferEngine(
        const InfinilmModel::Config &config,
        const distributed::DistConfig &distributed_config = distributed::DistConfig(),
...@@ -26,6 +39,13 @@ public:
        const cache::CacheConfig *cache_config = nullptr,
        bool enable_graph_compiling = false);
InferEngine(
const std::string &model_path = "",
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false);
    // Load a parameter to all workers (each can extract its shard inside RankWorker)
    void load_param(const std::string &name, const infinicore::Tensor &param);
...@@ -50,8 +70,9 @@ protected:
    std::vector<std::unique_ptr<RankWorker>> workers_;
    std::unique_ptr<RankBarrier> barrier_;
    distributed::CommunicationGroup communication_group_;
-   const InfinilmModel::Config &model_config_;
    std::unique_ptr<cache::CacheConfig> cache_config_;
const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
};
} // namespace infinilm::engine
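A hedged migration sketch for the deprecation notes above: the legacy constructor takes a prebuilt InfinilmModel::Config, while the new overload takes a model directory and loads <model_path>/config.json (including quantization_config) through infinilm::config::ModelConfig. The header name and paths below are assumptions for illustration.

```cpp
#include "infer_engine.hpp" // assumed header name for the declarations above

void build_engine_from_path() {
    // New-style construction: only the model directory is required; the distributed
    // config, device type, cache config, and graph-compiling flag keep their defaults.
    infinilm::engine::InferEngine engine("/path/to/model_dir");

    // Equivalent explicit form, spelling out the defaulted arguments:
    infinilm::engine::InferEngine engine_explicit(
        "/path/to/model_dir",
        infinilm::engine::distributed::DistConfig(),
        infinicore::context::getDevice().getType(),
        /*cache_config=*/nullptr,
        /*enable_graph_compiling=*/false);
}
```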
...@@ -10,12 +10,24 @@
namespace infinilm::engine {
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
RankWorker::RankWorker(const InfinilmModel::Config &model_config,
                       const distributed::RankInfo &rank_info,
                       const cache::CacheConfig *cache_config,
                       RankBarrier *barrier,
                       bool enable_graph_compiling)
-   : model_config_(model_config),
+   : legacy_model_config_(model_config),
      rank_info_(rank_info),
      enable_graph_compiling_(enable_graph_compiling),
      job_cmd_(Command::INIT),
...@@ -36,6 +48,32 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
    cv_.wait(lk, [&] { return init_done_; });
}
RankWorker::RankWorker(
std::shared_ptr<infinilm::config::ModelConfig> model_config,
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling)
: model_config_(model_config),
rank_info_(rank_info),
enable_graph_compiling_(enable_graph_compiling),
job_cmd_(Command::INIT),
has_job_(false),
job_done_(false),
should_exit_(false),
init_done_(false),
rng_(std::random_device{}()),
barrier_(barrier) {
if (cache_config != nullptr) {
pending_cache_config_ = cache_config->unique_copy();
}
// start the thread
thread_ = std::thread(&RankWorker::thread_loop, this);
// Wait until the worker thread finishes initialization (model created)
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_; });
}
std::string RankWorker::info() const {
    std::stringstream ss;
...@@ -195,7 +233,13 @@ void RankWorker::thread_loop() {
        infinicore::context::setDevice(rank_info_.device);
        // Create model using factory (may be expensive)
        if (model_config_ == nullptr) {
            model_ = InfinilmModelFactory::createModel(legacy_model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
        } else {
            model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
        }
        if (!model_) {
            throw std::runtime_error("Failed to create model");
        }
......
#pragma once
#include "../cache/cache.hpp"
#include "../config/model_config.hpp"
#include "../models/model_factory.hpp"
#include "compiler/general_compiler.hpp"
#include "distributed/distributed.hpp"
...@@ -62,6 +63,12 @@ public:
               RankBarrier *barrier,
               bool enable_graph_compiling);
RankWorker(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling);
    // Submit a parameter load job and wait until the load completes on the worker thread.
    void load_param(const std::string &name,
                    const infinicore::Tensor &param);
...@@ -94,7 +101,8 @@ private:
private:
    // Worker properties
-   const InfinilmModel::Config &model_config_;
+   const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
    distributed::RankInfo rank_info_;
    std::shared_ptr<InfinilmModel> model_;
    std::shared_ptr<cache::Cache> cache_;
......
...@@ -6,6 +6,18 @@ namespace infinilm::layers {
// ---------------------------------------------------------
// QKV Parallel Linear
// ---------------------------------------------------------
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
                                     size_t head_dim,
                                     size_t num_q_head,
...@@ -57,6 +69,61 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
    v_out_size_ = num_v_head_ * v_dim_ / tp_size_;
}
QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
size_t head_dim,
size_t num_q_head,
size_t num_kv_head,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
bool bias,
const infinicore::DataType &dtype,
const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: QKVParallelLinear(hidden_size,
head_dim, head_dim, head_dim,
num_q_head, num_kv_head, num_kv_head,
bias, bias, bias,
quantization,
dtype, device, rank_info) {}
QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
size_t q_dim, size_t k_dim, size_t v_dim,
size_t num_q_head, size_t num_k_head, size_t num_v_head,
bool q_bias, bool k_bias, bool v_bias,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
const infinicore::DataType &dtype,
const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: infinicore::nn::ColumnParallelLinear(
hidden_size,
num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim,
quantization,
(q_bias || k_bias || v_bias),
dtype,
device,
rank_info.tp_rank,
rank_info.tp_size),
q_dim_(q_dim),
k_dim_(k_dim),
v_dim_(v_dim),
num_q_head_(num_q_head),
num_k_head_(num_k_head),
num_v_head_(num_v_head),
q_bias_(q_bias),
k_bias_(k_bias),
v_bias_(v_bias) {
if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) {
throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size");
}
if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) {
throw std::runtime_error("q_bias, k_bias, v_bias must all match");
}
q_out_size_ = num_q_head_ * q_dim_ / tp_size_;
k_out_size_ = num_k_head_ * k_dim_ / tp_size_;
v_out_size_ = num_v_head_ * v_dim_ / tp_size_;
}
std::tuple<infinicore::Tensor, infinicore::Tensor, infinicore::Tensor>
QKVParallelLinear::forward_split(infinicore::Tensor &input) {
    auto output = this->forward(input);
...@@ -86,6 +153,40 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight() const {
        0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale() const {
return infinicore::nn::Parameter(
weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale() const {
return infinicore::nn::Parameter(
weight_scale_->narrow({{0, q_out_size_, k_out_size_}}),
0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const {
return infinicore::nn::Parameter(
weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros() const {
return infinicore::nn::Parameter(
weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros() const {
return infinicore::nn::Parameter(
weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}),
0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros() const {
return infinicore::nn::Parameter(
weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}),
0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter QKVParallelLinear::get_q_bias() const {
    if (!q_bias_) {
        return infinicore::nn::Parameter();
...@@ -120,6 +221,18 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; }
// ---------------------------------------------------------
// Gate-Up Parallel Linear
// ---------------------------------------------------------
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias,
                                           const infinicore::DataType &dtype, const infinicore::Device &device,
                                           engine::distributed::RankInfo rank_info)
...@@ -135,6 +248,22 @@ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermedia
    }
}
GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization, bool bias,
const infinicore::DataType &dtype, const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) {
}
GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
const infinicore::DataType &dtype, const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
if (gate_bias_ != up_bias_) {
throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
}
}
std::tuple<infinicore::Tensor, infinicore::Tensor> GateUpParallelLinear::forward_split(infinicore::Tensor &input) {
    auto output = this->forward(input);
    auto cols = output->shape()[2];
...@@ -168,6 +297,22 @@ infinicore::nn::Parameter GateUpParallelLinear::get_up_bias() const {
    }
}
infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale() const {
return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale() const {
return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros() const {
return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_);
}
infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros() const {
return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_);
}
bool GateUpParallelLinear::has_gate_bias() const {
    return gate_bias_;
}
......
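For reference, a minimal sketch of wiring the new quantization-aware QKV constructor to a loaded ModelConfig, using only constructors and accessors introduced in this diff; the config path, include paths, and default device/rank settings are assumptions.

```cpp
#include "../config/model_config.hpp" // assumed relative include, as in rank_worker.hpp
#include "fused_linear.hpp"           // assumed header name for the layer above

#include <memory>

void build_quantized_qkv() {
    auto cfg = std::make_shared<infinilm::config::ModelConfig>("/path/to/model_dir/config.json");

    // Overload that forwards a BaseQuantization object down to ColumnParallelLinear;
    // device and rank_info fall back to their defaults (default device, single rank).
    infinilm::layers::QKVParallelLinear qkv_proj(
        cfg->get<size_t>("hidden_size"),
        cfg->get_head_dim(),
        cfg->get<size_t>("num_attention_heads"),
        cfg->get<size_t>("num_key_value_heads"),
        cfg->get_quantization_method(),
        /*bias=*/cfg->get_or<bool>("attention_bias", false),
        cfg->get_dtype());

    // For quantized checkpoints, the per-slice scale/zero-point views added in this
    // change are available next to the weights, e.g. qkv_proj.get_q_weight_scale().
    (void)qkv_proj;
}
```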
#pragma once
#include "infinicore/nn/linear.hpp"
#include "infinicore/quantization.hpp"
#include "../engine/distributed/communication_group.hpp"
...@@ -23,6 +24,25 @@ public:
                               const infinicore::Device &device = infinicore::Device(),
                               engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
explicit QKVParallelLinear(size_t hidden_size,
size_t q_dim, size_t k_dim, size_t v_dim,
size_t num_q_head, size_t num_k_head, size_t num_v_head,
bool q_bias, bool k_bias, bool v_bias,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
// A more common case where all heads have the same dimension
explicit QKVParallelLinear(size_t hidden_size,
size_t head_dim,
size_t num_q_head, size_t num_kv_head,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
    std::tuple<infinicore::Tensor, infinicore::Tensor, infinicore::Tensor>
    forward_split(infinicore::Tensor &input);
...@@ -30,6 +50,14 @@ public:
    infinicore::nn::Parameter get_k_weight() const;
    infinicore::nn::Parameter get_v_weight() const;
infinicore::nn::Parameter get_q_weight_scale() const;
infinicore::nn::Parameter get_k_weight_scale() const;
infinicore::nn::Parameter get_v_weight_scale() const;
infinicore::nn::Parameter get_q_weight_zeros() const;
infinicore::nn::Parameter get_k_weight_zeros() const;
infinicore::nn::Parameter get_v_weight_zeros() const;
    infinicore::nn::Parameter get_q_bias() const;
    infinicore::nn::Parameter get_k_bias() const;
    infinicore::nn::Parameter get_v_bias() const;
...@@ -55,6 +83,18 @@ private:
class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear {
public:
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
    GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false,
                         const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
                         engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
...@@ -63,14 +103,33 @@ public:
                         const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
                         engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
std::shared_ptr<infinicore::quantization::BaseQuantization> quantization,
const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
    std::tuple<infinicore::Tensor, infinicore::Tensor> forward_split(infinicore::Tensor &input);
    infinicore::nn::Parameter get_gate_weight() const;
    infinicore::nn::Parameter get_gate_weight_scale() const;
    infinicore::nn::Parameter get_gate_weight_zeros() const;
    infinicore::nn::Parameter get_gate_bias() const;
    infinicore::nn::Parameter get_up_weight() const;
    infinicore::nn::Parameter get_up_weight_scale() const;
    infinicore::nn::Parameter get_up_weight_zeros() const;
    infinicore::nn::Parameter get_up_bias() const;
    bool has_gate_bias() const;
...@@ -103,4 +162,62 @@ private:
    if (name##_->has_up_bias()) \
        this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
// ========================= QKV Quantization ==================================
#define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \
this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \
this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \
this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \
this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \
this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \
if (name##_->has_q_bias()) \
this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \
if (name##_->has_k_bias()) \
this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \
if (name##_->has_v_bias()) \
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) \
name##_ = std::make_shared<layers::QKVParallelLinear>(__VA_ARGS__); \
this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight()); \
this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros()); \
this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale()); \
this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight()); \
this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros()); \
this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale()); \
this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight()); \
this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros()); \
this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale()); \
if (name##_->has_q_bias()) \
this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \
if (name##_->has_k_bias()) \
this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \
if (name##_->has_v_bias()) \
this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias());
// ========================= Gate-Up Quantization ==============================
#define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \
name##_ = std::make_shared<layers::GateUpParallelLinear>(__VA_ARGS__); \
this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \
this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \
this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \
this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \
if (name##_->has_gate_bias()) \
this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \
if (name##_->has_up_bias()) \
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) \
name##_ = std::make_shared<layers::GateUpParallelLinear>(__VA_ARGS__); \
this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight()); \
this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale()); \
this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros()); \
this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight()); \
this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale()); \
this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros()); \
if (name##_->has_gate_bias()) \
this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \
if (name##_->has_up_bias()) \
this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias());
} // namespace infinilm::layers
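A hedged sketch of how a module might use the new registration macros above: the macro constructs the fused layer and registers the checkpoint-facing parameter names ("*.weight" / "*.weight_scale" for W8A8; "*.qweight" / "*.qzeros" / "*.scales" for AWQ W4A16). The module name, member, and include paths here are illustrative, not part of this change.

```cpp
#include "fused_linear.hpp"           // assumed header name for the macros above
#include "../config/model_config.hpp" // assumed relative include
#include "infinicore/nn/module.hpp"

#include <memory>

namespace infinilm {

// Illustrative module owning a fused gate/up projection; register_parameter() lets the
// weight loader match checkpoint tensors such as "gate_proj.weight_scale".
class ExampleMlp : public infinicore::nn::Module {
public:
    ExampleMlp(std::shared_ptr<config::ModelConfig> cfg,
               const infinicore::Device &device,
               engine::distributed::RankInfo rank_info) {
        const auto dtype = cfg->get_dtype();
        // Expands to: gate_up_proj_ = std::make_shared<layers::GateUpParallelLinear>(...)
        // followed by register_parameter() calls for weights, scales, and optional biases.
        INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj",
                                          cfg->get<size_t>("hidden_size"),
                                          cfg->get<size_t>("intermediate_size"),
                                          cfg->get_quantization_method(),
                                          /*bias=*/false,
                                          dtype, device, rank_info);
    }

private:
    std::shared_ptr<layers::GateUpParallelLinear> gate_up_proj_;
};

} // namespace infinilm
```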
#pragma once
-#include "infinicore/nn/module.hpp"
#include "../cache/cache.hpp"
#include "infinicore/nn/module.hpp"
#include "nlohmann/json.hpp"
#include <any>
...@@ -13,7 +13,6 @@ class InfinilmModel : public infinicore::nn::Module {
public:
    struct Config {
        std::string model_type;
        virtual ~Config() = default;
    };
......
...@@ -16,9 +16,9 @@
 * - LlamaForCausalLM: Complete model with language modeling head
 */
-#include "llama_config.hpp"
+#include "../../config/model_config.hpp"
#include "llama_attention.hpp"
-#include "llama_mlp.hpp"
#include "llama_decoder_layer.hpp"
-#include "llama_model.hpp"
#include "llama_for_causal_lm.hpp"
#include "llama_mlp.hpp"
#include "llama_model.hpp"
...@@ -9,7 +9,6 @@
#include <algorithm>
#include <cmath>
#include <cstring>
-#include <iostream>
#include <optional>
#include <spdlog/spdlog.h>
#include <stdexcept>
...@@ -17,6 +16,18 @@
namespace infinilm::models::llama {
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
LlamaAttention::LlamaAttention(const LlamaConfig &config,
                               const infinicore::Device &device,
                               size_t layer_idx,
...@@ -61,6 +72,65 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config,
    }
}
LlamaAttention::LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device,
size_t layer_idx,
engine::distributed::RankInfo rank_info)
: model_config_(model_config),
layer_idx_(layer_idx),
hidden_size_(model_config->get<size_t>("hidden_size")),
num_attention_heads_(model_config->get<size_t>("num_attention_heads")),
num_key_value_heads_(model_config->get<size_t>("num_key_value_heads")),
head_dim_(model_config->get_head_dim()),
kv_dim_(model_config->get_kv_dim()),
use_bias_(model_config->get_or<bool>("attention_bias", true)),
use_output_bias_(model_config->get_or<bool>("attention_output_bias", false)),
max_position_embeddings_(model_config->get<size_t>("max_position_embeddings")),
rank_info_(rank_info) {
const auto &dtype{model_config_->get_dtype()};
int tp_rank = rank_info.tp_rank;
int tp_size = rank_info.tp_size;
int num_attention_heads = model_config_->get<size_t>("num_attention_heads");
int num_key_value_heads = model_config_->get<size_t>("num_key_value_heads");
if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) {
this->num_attention_heads_ = num_attention_heads / tp_size;
this->num_key_value_heads_ = num_key_value_heads / tp_size;
} else {
throw std::runtime_error("num_attention_heads / tp_size error.");
}
scaling_ = 1.0f / std::sqrt(static_cast<float>(head_dim_));
auto quant_scheme = this->model_config_->get_quant_scheme();
switch (quant_scheme) {
case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8:
INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_,
dtype, device, rank_info);
INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_,
dtype, device, tp_rank, tp_size, rank_info.comm);
break;
case infinicore::quantization::QuantScheme::AWQ_W4A16:
INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_,
dtype, device, rank_info);
INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_,
dtype, device, tp_rank, tp_size, rank_info.comm);
break;
default:
INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get<size_t>("num_attention_heads"), model_config_->get<size_t>("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_,
dtype, device, rank_info);
INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get<size_t>("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_,
dtype, device, tp_rank, tp_size, rank_info.comm);
break;
}
if (model_config_->get<std::string>("model_type") == "qwen3") {
INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get<double>("rms_norm_eps"), dtype, device);
}
}
infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states,
                                            const infinicore::Tensor &position_ids,
                                            std::shared_ptr<infinilm::cache::Cache> kv_cache,
...@@ -75,7 +145,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
    // 1. Project Q, K, V
    auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable);
-   if (use_qk_norm_) {
+   if (use_qk_norm_ || model_config_->get_or<std::string>("model_type", "None") == "qwen3") {
        q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_}));
        k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_}));
    }
...@@ -126,9 +196,8 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
    }
    infinicore::Tensor attn_output;
-   if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA
-       || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR
-       || q_reshaped->device().getType() == infinicore::Device::Type::CAMBRICON) {
+   if (false) {
+       // experimental ninetoothed flash attention
        attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true);
        attn_output = attn_output->permute({0, 2, 1, 3})
                          ->contiguous()
...@@ -197,7 +266,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
    auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_});
    auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_});
-   if (use_qk_norm_) {
+   if (use_qk_norm_ || model_config_->get_or<std::string>("model_type", "None") == "qwen3") {
        q_reshaped = q_norm_->forward(q_reshaped);
        k_reshaped = k_norm_->forward(k_reshaped);
    }
......
#pragma once
#include "../../cache/kv_cache.hpp"
#include "../../config/model_config.hpp"
#include "../../engine/distributed/distributed.hpp"
#include "../../layers/fused_linear.hpp"
#include "llama_config.hpp"
...@@ -36,11 +37,28 @@ public:
     * @param layer_idx Layer index for cache access
     * @param dtype Optional data type for model parameters (defaults to F32)
     */
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
    LlamaAttention(const LlamaConfig &config,
                   const infinicore::Device &device,
                   size_t layer_idx,
                   engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
LlamaAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device,
size_t layer_idx,
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
    /**
     * @brief Forward pass: compute attention
     *
...@@ -101,6 +119,7 @@ protected:
    std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
private:
    std::shared_ptr<infinilm::config::ModelConfig> model_config_ = std::make_shared<infinilm::config::ModelConfig>();
    size_t layer_idx_; // Layer index for cache access
    size_t hidden_size_;
    size_t num_attention_heads_;
...@@ -109,7 +128,7 @@ private:
    size_t kv_dim_;
    bool use_bias_;        // Bias for Q/K/V projections
    bool use_output_bias_; // Bias for output projection (o_proj)
-   bool use_qk_norm_;         // Whether to use QK RMSNorm
+   bool use_qk_norm_ = false; // Whether to use QK RMSNorm
    size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility)
    float scaling_;
......
#include "llama_decoder_layer.hpp" #include "llama_decoder_layer.hpp"
#include "infinicore/nn/rmsnorm.hpp" #include "infinicore/nn/rmsnorm.hpp"
#include "infinicore/ops.hpp" #include "infinicore/ops.hpp"
#include <optional> #include <optional>
namespace infinilm::models::llama { namespace infinilm::models::llama {
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config,
                                     const infinicore::Device &device,
                                     size_t layer_idx,
...@@ -23,6 +33,22 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config,
    INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_);
}
LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device,
size_t layer_idx,
engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) {
const auto &dtype{model_config_->get_dtype()};
// Initialize layer normalization layers
INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
dtype, device);
INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get<size_t>("hidden_size"), model_config_->get<double>("rms_norm_eps"),
dtype, device);
// Initialize attention and MLP modules
INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_);
INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_);
}
std::tuple<infinicore::Tensor, infinicore::Tensor>
LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
                           infinicore::Tensor &residual,
......
...@@ -33,11 +33,28 @@ public:
     * @param layer_idx Layer index for cache management and debugging
     * @param dtype Optional data type for model parameters (defaults to F32)
     */
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
    LlamaDecoderLayer(const LlamaConfig &config,
                      const infinicore::Device &device,
                      size_t layer_idx,
                      engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
LlamaDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device,
size_t layer_idx,
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());
    /**
     * @brief Forward pass: process one decoder layer
     *
...@@ -79,6 +96,7 @@ protected:
    INFINICORE_NN_MODULE(LlamaAttention, self_attn);
    INFINICORE_NN_MODULE(LlamaMLP, mlp);
    engine::distributed::RankInfo rank_info_;
    std::shared_ptr<infinilm::config::ModelConfig> model_config_;
private:
    size_t layer_idx_; // Layer index for cache management and debugging
......
...@@ -2,19 +2,26 @@
#include "infinicore/context/context.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/ops.hpp"
-#include <iostream>
namespace infinilm::models::llama {
/**
* @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0).
*
* ⚠️ DEVELOPMENT POLICY:
* - NO new development or feature additions permitted on this interface
* - Only critical bug fixes (security/stability) allowed until removal
* - All new code MUST migrate to the polymorphic overload below
*
* Replacement: Use the polymorphic overload of this same function name with updated signature
* Reason: Legacy signature lacks support for dynamic quantization modes.
* Removal target: v0.2.0 (Q2 2026)
*/
LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config,
                                   const infinicore::Device &device,
                                   engine::distributed::RankInfo rank_info) {
    // Initialize module's device_ member
    device_ = device;
    const auto &dtype{config.dtype};
    // Initialize base model
    INFINICORE_NN_MODULE_INIT(model, config, device, rank_info);
...@@ -25,6 +32,24 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config,
                              dtype, device);
}
LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device,
engine::distributed::RankInfo rank_info) {
// Initialize module's device_ member
device_ = device;
const auto &dtype{model_config->get_dtype()};
// Initialize base model
INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info);
// Initialize language modeling head
// Note: If tie_word_embeddings is true, we would share weights with embed_tokens
// For now, we create a separate linear layer
INFINICORE_NN_MODULE_INIT(lm_head, model_config->get<size_t>("hidden_size"), model_config->get<size_t>("vocab_size"), false,
dtype, device);
}
LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
    auto input_ids = input.input_ids.value();
    auto position_ids = input.position_ids.value();
...@@ -40,7 +65,6 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
    // 2. Apply language modeling head to get logits
    auto logits = lm_head_->forward(hidden_states);
    return {logits};
}
......