Unverified commit 9c256a17 authored by Ceng, committed by GitHub

issue/106: adapt the 9G7B model

parent 39bea30a
......@@ -4,6 +4,7 @@
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"
#include "infinicore/context/context.hpp"
#include <algorithm>
#include <memory>
#include <numeric>
......@@ -14,23 +15,23 @@
namespace infinilm::cache {
/**
* @brief Simple KV cache structure for incremental decoding
* @brief Single layer's KV cache for incremental decoding
*
* Stores key and value caches with shape [batch_size, n_kv_head, capacity, head_dim]
* Similar to DynamicLayer in Python cache_utils.py
*
* This is a common component that can be used by any model architecture
* that needs KV caching for attention mechanisms.
* This represents a single layer's cache within a model-level cache container.
*/
struct KVCache {
struct KVCacheLayer {
infinicore::Tensor k_cache; // [batch_size, n_kv_head, capacity, head_dim]
infinicore::Tensor v_cache; // [batch_size, n_kv_head, capacity, head_dim]
std::vector<size_t> cache_positions; // Current write position in the cache, one entry per batch item
size_t max_capacity; // Maximum capacity of cache
bool initialized; // Whether cache has been initialized
KVCache() : max_capacity(0), initialized(false) {}
KVCacheLayer() : max_capacity(0), initialized(false) {}
/**
* @brief Initialize or update cache capacity
......@@ -39,11 +40,20 @@ struct KVCache {
* @param seq_len Sequence length of new tokens
* @param dtype Data type
* @param device Device
* @param max_position_embeddings Maximum position embeddings (for initial capacity)
*/
void ensure_capacity(size_t batch_size, size_t num_kv_heads, size_t head_dim, size_t seq_len,
infinicore::DataType dtype, const infinicore::Device &device) {
infinicore::DataType dtype, const infinicore::Device &device,
size_t max_position_embeddings = 4096) {
// Required capacity = new tokens + furthest existing write position across the batch
size_t required_capacity = seq_len + std::accumulate(cache_positions.begin(), cache_positions.end(), size_t(0), [](size_t a, size_t b) { return std::max(a, b); });
// VALIDATION: Verify input parameters
if (num_kv_heads == 0 || head_dim == 0 || seq_len == 0) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Invalid parameters - num_kv_heads: {}, head_dim: {}, seq_len: {}",
num_kv_heads, head_dim, seq_len);
throw std::runtime_error("KV cache ensure_capacity: invalid parameters");
}
// Lazy initialization
if (!initialized) {
max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
......@@ -53,6 +63,15 @@ struct KVCache {
dtype, device);
cache_positions = std::vector<size_t>(batch_size, 0);
initialized = true;
// VALIDATION: Verify cache was created correctly
// Shape is [batch_size, num_kv_heads, max_capacity, head_dim]
if (k_cache->shape()[0] != batch_size || k_cache->shape()[1] != num_kv_heads ||
k_cache->shape()[2] != max_capacity || k_cache->shape()[3] != head_dim) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Cache shape mismatch after initialization - expected: [{}, {}, {}, {}], got: {}",
batch_size, num_kv_heads, max_capacity, head_dim, k_cache->info());
throw std::runtime_error("KV cache initialization: shape mismatch");
}
}
// Grow cache if needed (similar to DynamicLayer in Python)
else if (required_capacity > max_capacity) {
......@@ -83,10 +102,25 @@ struct KVCache {
k_cache = k_new;
v_cache = v_new;
max_capacity = new_capacity;
// VALIDATION: Verify cache was grown correctly
// Shape is [batch_size, num_kv_heads, max_capacity, head_dim]
if (k_cache->shape()[2] != new_capacity) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: New cache capacity mismatch - expected: {}, got: {}",
new_capacity, k_cache->shape()[2]);
throw std::runtime_error("KV cache growth: capacity mismatch");
}
}
// VALIDATION: Final check that capacity is sufficient
if (required_capacity > max_capacity) {
SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Capacity still insufficient after growth - required: {}, max_capacity: {}",
required_capacity, max_capacity);
throw std::runtime_error("KV cache ensure_capacity: capacity insufficient");
}
}
KVCache(size_t max_batch_size, size_t n_kv_head, size_t head_dim, infinicore::DataType dtype, size_t max_seqlen = 4096, infinicore::Device device = infinicore::context::getDevice())
KVCacheLayer(size_t max_batch_size, size_t n_kv_head, size_t head_dim, infinicore::DataType dtype, size_t max_seqlen = 4096, infinicore::Device device = infinicore::context::getDevice())
: max_capacity(max_seqlen), initialized(false) {
cache_positions = std::vector<size_t>(max_batch_size, 0);
ensure_capacity(max_batch_size, n_kv_head, head_dim, max_capacity, dtype, device);
......@@ -96,7 +130,7 @@ struct KVCache {
* @brief Update cache with new key and value states
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [n_kv_head, total_seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* Note: This method writes to the cache. If used together with an attention op that
* also writes to the cache, this should be called AFTER attention, not before.
......@@ -143,4 +177,120 @@ struct KVCache {
}
};
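// Hedged usage sketch (function and variable names are illustrative, not part of this
// change): a single decode step appends the new K/V states and reads back the
// accumulated cache for attention.
inline void append_one_decode_step(infinilm::cache::KVCacheLayer &layer,
                                   const infinicore::Tensor &k_step,   // [bs, n_kv_head, 1, head_dim]
                                   const infinicore::Tensor &v_step) { // [bs, n_kv_head, 1, head_dim]
    // update() grows the backing buffers via ensure_capacity() when required and
    // returns the accumulated cache with shape [bs, n_kv_head, total_seq_len, head_dim].
    auto [k_total, v_total] = layer.update(k_step, v_step);
    (void)k_total;
    (void)v_total;
}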
/**
* @brief Model-level KV cache container (similar to DynamicCache in Python)
*
* Stores a list of KVCacheLayer objects, one per model layer.
* This aligns with Python backend's DynamicCache architecture.
*/
class DynamicCache {
public:
/**
* @brief Construct DynamicCache with specified number of layers
*
* @param num_layers Number of model layers (creates one cache layer per model layer)
* @param max_position_embeddings Maximum position embeddings (used for initial capacity)
*/
DynamicCache(size_t num_layers, size_t max_position_embeddings = 4096)
: layers_(num_layers), max_position_embeddings_(max_position_embeddings) {}
/**
* @brief Update cache with new key and value states for a specific layer
*
* @param layer_idx Layer index (0-based)
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* This method updates the cache for the specified layer and returns the
* accumulated cache up to the current position.
*/
std::pair<infinicore::Tensor, infinicore::Tensor> update(
size_t layer_idx,
const infinicore::Tensor &k_new,
const infinicore::Tensor &v_new) {
if (layer_idx >= layers_.size()) {
SPDLOG_ERROR("DynamicCache::update: layer_idx {} out of range (num_layers: {})",
layer_idx, layers_.size());
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
// Update the cache for this layer
return layers_[layer_idx].update(k_new, v_new);
}
/**
* @brief Update cache with new key and value states (convenience method without layer_idx)
* This is used when the cache is accessed directly without layer information
*
* @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
* @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
* @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
*
* Note: This assumes layer_idx=0. For multi-layer models, use update(layer_idx, k_new, v_new) instead.
*/
std::pair<infinicore::Tensor, infinicore::Tensor> update(
const infinicore::Tensor &k_new,
const infinicore::Tensor &v_new) {
return update(0, k_new, v_new);
}
/**
* @brief Get the number of layers in this cache
*/
size_t num_layers() const { return layers_.size(); }
/**
* @brief Get cache position for a specific layer
*/
size_t cache_position(size_t layer_idx) const {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
if (layers_[layer_idx].cache_positions.empty()) {
return 0;
}
return layers_[layer_idx].cache_positions[0]; // All batch items should have same position
}
/**
* @brief Get max position embeddings (used for initial capacity)
*/
size_t max_position_embeddings() const { return max_position_embeddings_; }
/**
* @brief Reset cache for all layers to a specific position
* This should be called when starting a new generation sequence or resetting to a specific position
* @param pos Position to reset to (defaults to 0)
*/
void reset(size_t pos = 0) {
for (auto& layer : layers_) {
std::fill(layer.cache_positions.begin(), layer.cache_positions.end(), pos);
// Note: We don't reset initialized flag or clear the cache tensors
// to avoid reallocation. The cache will be overwritten on next update.
}
}
/**
* @brief Access a specific layer's cache (for advanced usage)
*/
KVCacheLayer& layer(size_t layer_idx) {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
return layers_[layer_idx];
}
const KVCacheLayer& layer(size_t layer_idx) const {
if (layer_idx >= layers_.size()) {
throw std::runtime_error("DynamicCache: layer_idx out of range");
}
return layers_[layer_idx];
}
private:
std::vector<KVCacheLayer> layers_;
size_t max_position_embeddings_;
};
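// Hedged sketch of how a model forward loop is expected to drive DynamicCache
// (function and variable names are illustrative, not part of this change):
inline void run_decode_step(infinilm::cache::DynamicCache &cache,
                            const std::vector<infinicore::Tensor> &k_per_layer,
                            const std::vector<infinicore::Tensor> &v_per_layer) {
    for (size_t i = 0; i < cache.num_layers(); ++i) {
        // Each model layer updates its own KVCacheLayer and receives the
        // accumulated K/V with shape [bs, n_kv_head, total_seq_len, head_dim].
        auto [k_total, v_total] = cache.update(i, k_per_layer[i], v_per_layer[i]);
        (void)k_total;
        (void)v_total;
    }
    // Between independent prompts, cache.reset() rewinds positions to 0
    // without reallocating the underlying tensors.
}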
} // namespace infinilm::cache
#include "infer_engine.hpp"
#include "spdlog/spdlog.h"
namespace infinilm::engine {
......@@ -71,4 +72,14 @@ const distributed::DistConfig &InferEngine::get_dist_config() const {
return communication_group_.get_dist_config();
}
//------------------------------------------------------
// reset_cache
//------------------------------------------------------
void InferEngine::reset_cache(size_t pos, bool async) {
// Reset cache on all workers
for (auto &worker : workers_) {
worker->reset_cache(pos, async);
}
}
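// Hedged caller-side sketch (helper name is illustrative): rewind all worker
// caches before serving a new, unrelated prompt. Synchronous by default; the
// async path is documented as unstable.
inline void reset_between_prompts(InferEngine &engine, bool async = false) {
    engine.reset_cache(/*pos=*/0, async);
}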
} // namespace infinilm::engine
......@@ -26,6 +26,11 @@ public:
infinicore::Tensor generate(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids);
// Reset the internal cache in all workers (clears state between generations)
// By default, this is synchronous (blocks until reset completes).
// If async=true, this becomes asynchronous (unstable - use with caution).
void reset_cache(size_t pos = 0, bool async = false);
~InferEngine();
const distributed::DistConfig &get_dist_config() const;
......
......@@ -111,6 +111,35 @@ void RankWorker::wait() {
}
}
//------------------------------------------------------
// reset_cache -- synchronous by default, async optional (unstable)
//------------------------------------------------------
void RankWorker::reset_cache(size_t pos, bool async) {
{
std::lock_guard<std::mutex> lock(mutex_);
if (should_exit_) {
throw std::runtime_error("RankWorker is closing; cannot reset_cache");
}
pending_reset_pos_ = pos;
job_cmd_ = Command::RESET_CACHE;
has_job_ = true;
job_done_ = false;
}
cv_.notify_all();
// By default, wait for job completion (synchronous)
// If async=true, return immediately (unstable - use with caution)
if (!async) {
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return job_done_ || should_exit_; });
if (should_exit_) {
throw std::runtime_error("RankWorker stopped while resetting cache");
}
}
}
//------------------------------------------------------
// close -- request shutdown and join thread
//------------------------------------------------------
......@@ -160,6 +189,7 @@ void RankWorker::thread_loop() {
std::string local_param_name;
infinicore::Tensor local_param;
std::vector<std::any> local_args;
size_t local_reset_pos = 0;
// Wait for a job or exit
{
......@@ -177,6 +207,8 @@ void RankWorker::thread_loop() {
local_param = pending_param_;
} else if (local_cmd == Command::RUN) {
local_args = pending_args_;
} else if (local_cmd == Command::RESET_CACHE) {
local_reset_pos = pending_reset_pos_;
}
// mark job as being processed
......@@ -225,6 +257,25 @@ void RankWorker::thread_loop() {
spdlog::error("[{}] exception during forward: {}\n", info(), e.what());
break;
}
} else if (local_cmd == Command::RESET_CACHE) {
try {
// Generic reset_cache on the model interface
model_->reset_cache(local_reset_pos);
{
std::lock_guard<std::mutex> lk(mutex_);
job_done_ = true;
}
cv_.notify_all();
} catch (const std::exception &e) {
std::lock_guard<std::mutex> lk(mutex_);
should_exit_ = true;
job_done_ = true;
cv_.notify_all();
spdlog::error("[{}] exception during reset_cache: {}\n", info(), e.what());
break;
}
} else {
// Shouldn't reach here (no-op)
}
......
......@@ -17,6 +17,7 @@ class RankWorker {
INIT,
LOAD,
RUN,
RESET_CACHE,
STOP
};
......@@ -34,6 +35,11 @@ public:
// Submit a run (forward) job.
void run(const std::vector<std::any> &args);
// Reset the internal cache in the model (clears state between generations)
// By default, this is synchronous (blocks until reset completes).
// If async=true, this becomes asynchronous (unstable - use with caution).
void reset_cache(size_t pos = 0, bool async = false);
// Wait until run job completes. The result can be retrieved with get_output().
void wait();
......@@ -67,6 +73,7 @@ private:
std::string pending_param_name_;
infinicore::Tensor pending_param_;
std::vector<std::any> pending_args_;
size_t pending_reset_pos_ = 0;
// Output (protected by mutex)
infinicore::Tensor output_;
......
......@@ -28,34 +28,100 @@ inline void log_tensor_stats(const infinicore::Tensor &tensor, const std::string
SPDLOG_INFO(" {}: shape={}, dtype={}, device={}", name, shape_str, static_cast<int>(dtype), device.toString());
// For F32 tensors, compute and log statistics
if (dtype == infinicore::DataType::F32) {
// For F32, F16, and BF16 tensors, compute and log statistics
if (dtype == infinicore::DataType::F32 ||
dtype == infinicore::DataType::F16 ||
dtype == infinicore::DataType::BF16) {
// Copy to CPU if needed and compute stats
auto cpu_tensor = tensor->to(infinicore::Device(infinicore::Device::Type::CPU, 0));
std::byte *raw_data = cpu_tensor->data();
float *data = reinterpret_cast<float*>(raw_data);
size_t numel = cpu_tensor->numel();
if (numel > 0) {
float min_val = *std::min_element(data, data + numel);
float max_val = *std::max_element(data, data + numel);
float sum = std::accumulate(data, data + numel, 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats: min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
// Log sample values at specific positions
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, data[i]);
if (dtype == infinicore::DataType::F32) {
float *data = reinterpret_cast<float*>(raw_data);
float min_val = *std::min_element(data, data + numel);
float max_val = *std::max_element(data, data + numel);
float sum = std::accumulate(data, data + numel, 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats: min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
// Log sample values at specific positions
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, data[i]);
}
}
} else if (dtype == infinicore::DataType::F16) {
// F16 is typically uint16_t, need to convert to float for logging
uint16_t *data = reinterpret_cast<uint16_t*>(raw_data);
std::vector<float> float_data(numel);
for (size_t i = 0; i < numel; ++i) {
// F16 to F32 conversion; zero/subnormal and inf/NaN are handled explicitly so
// the logged statistics are not skewed (subnormals are flushed to signed zero here)
uint16_t h = data[i];
uint32_t sign = (h >> 15) & 0x1;
uint32_t exp = (h >> 10) & 0x1F;
uint32_t mant = h & 0x3FF;
uint32_t f32;
if (exp == 0) { f32 = sign << 31; }                                          // +/-0 (subnormals flushed)
else if (exp == 31) { f32 = (sign << 31) | (0xFFu << 23) | (mant << 13); }   // inf / NaN
else { f32 = (sign << 31) | ((exp + 112) << 23) | (mant << 13); }
float_data[i] = *reinterpret_cast<float*>(&f32);
}
float min_val = *std::min_element(float_data.begin(), float_data.end());
float max_val = *std::max_element(float_data.begin(), float_data.end());
float sum = std::accumulate(float_data.begin(), float_data.end(), 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats (F16): min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
}
} else if (dtype == infinicore::DataType::BF16) {
// BF16 is typically uint16_t, need to convert to float for logging
uint16_t *data = reinterpret_cast<uint16_t*>(raw_data);
std::vector<float> float_data(numel);
for (size_t i = 0; i < numel; ++i) {
// BF16 to F32 conversion
uint16_t b = data[i];
uint32_t f32 = (static_cast<uint32_t>(b) << 16);
float_data[i] = *reinterpret_cast<float*>(&f32);
}
float min_val = *std::min_element(float_data.begin(), float_data.end());
float max_val = *std::max_element(float_data.begin(), float_data.end());
float sum = std::accumulate(float_data.begin(), float_data.end(), 0.0f);
float mean_val = sum / static_cast<float>(numel);
SPDLOG_INFO(" Stats (BF16): min={:.6e}, max={:.6e}, mean={:.6e}, numel={}",
min_val, max_val, mean_val, numel);
if (log_samples && numel > 0) {
size_t sample_count = std::min(max_samples, numel);
SPDLOG_INFO(" Sample values (first {}):", sample_count);
for (size_t i = 0; i < sample_count; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
// Also log last N values to see newly appended decode tokens
// This is critical for debugging precision issues at decode steps
if (numel > sample_count) {
SPDLOG_INFO(" Sample values (last {}):", sample_count);
for (size_t i = numel - sample_count; i < numel; ++i) {
SPDLOG_INFO(" [{}] = {:.6e}", i, float_data[i]);
}
}
}
}
}
} else {
SPDLOG_INFO(" {} (Stats computation skipped for non-F32 tensor)", name);
SPDLOG_INFO(" {} (Stats computation skipped for unsupported dtype)", name);
}
}
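// Hedged refactoring sketch (helper names are illustrative, not part of this
// change): the two half-precision branches above could share small bit-cast
// helpers instead of repeating the conversion inline. Requires <cstring> and <cstdint>.
inline float f16_bits_to_float(uint16_t h) {
    uint32_t sign = (h >> 15) & 0x1;
    uint32_t exp = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x3FF;
    uint32_t bits;
    if (exp == 0) {
        bits = sign << 31;                                  // zero (subnormals flushed)
    } else if (exp == 31) {
        bits = (sign << 31) | (0xFFu << 23) | (mant << 13); // inf / NaN
    } else {
        bits = (sign << 31) | ((exp + 112) << 23) | (mant << 13);
    }
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}
inline float bf16_bits_to_float(uint16_t b) {
    uint32_t bits = static_cast<uint32_t>(b) << 16;
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}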
......
......@@ -9,5 +9,7 @@ class InfinilmModel : public infinicore::nn::Module {
public:
virtual ~InfinilmModel() = default;
virtual infinicore::Tensor forward(std::vector<std::any>) const = 0;
// Optional: reset cache; default no-op for models without cache
virtual void reset_cache(size_t pos = 0) {}
};
} // namespace infinilm
......@@ -13,27 +13,32 @@
namespace infinilm::models::llama {
LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype)
: hidden_size_(config.hidden_size),
: layer_idx_(layer_idx),
hidden_size_(config.hidden_size),
num_attention_heads_(config.num_attention_heads),
num_key_value_heads_(config.num_key_value_heads),
head_dim_(config.head_dim),
kv_dim_(config.kv_dim()),
use_bias_(config.attention_bias) {
use_bias_(config.attention_bias),
use_output_bias_(config.attention_output_bias),
max_position_embeddings_(config.max_position_embeddings) {
// Initialize projection layers
INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_,
dtype, device);
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_,
dtype, device);
// Output projection uses attention_output_bias (can be different from qkv)
INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_,
dtype, device);
}
infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache) const {
const infinicore::Tensor &position_ids,
void *kv_cache) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
......@@ -72,26 +77,22 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
throw std::runtime_error("Unexpected position_ids shape");
}
// 4. Process each batch item separately for attention computation
infinilm::cache::KVCache *external_cache = static_cast<infinilm::cache::KVCache *>(kv_cache);
// 4. Prepare KV caches
// Convert to [batch, n_head, seq_len, head_dim] for cache
// Ensure contiguous after permute for F16 compatibility with cache operations
q_reshaped = q_reshaped->permute({0, 2, 1, 3})->contiguous(); // [bs, n_q_head, seq_len, head_dim]
auto k_permuted = k_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
auto v_permuted = v_reshaped->permute({0, 2, 1, 3}); // [bs, n_kv_head, seq_len, head_dim]
// 4. Prepare KV caches
infinilm::cache::DynamicCache *external_cache = static_cast<infinilm::cache::DynamicCache *>(kv_cache);
infinicore::Tensor k_total; // [bs, n_kv_head, total_seq_len, head_dim]
infinicore::Tensor v_total; // [bs, n_kv_head, total_seq_len, head_dim]
if (external_cache != nullptr) {
auto [k_total_tmp, v_total_tmp] = external_cache->update(k_permuted, v_permuted);
auto [k_total_tmp, v_total_tmp] = external_cache->update(layer_idx_, k_permuted, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
} else {
auto [k_total_tmp, v_total_tmp] = internal_cache_.update(k_permuted, v_permuted);
k_total = k_total_tmp;
v_total = v_total_tmp;
// No external cache was provided - the attention layer now requires a model-level DynamicCache, so fail fast
throw std::runtime_error("LlamaAttention: kv_cache is required but nullptr provided");
}
auto total_seq_len = k_total->shape()[2];
......
......@@ -29,9 +29,11 @@ public:
*
* @param config Model configuration
* @param device Device to create tensors on
* @param layer_idx Layer index for cache access
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
......@@ -39,13 +41,18 @@ public:
*
* @param hidden_states Input tensor of shape [batch, seq_len, hidden_size]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_cache Optional KV cache for incremental decoding
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*/
infinicore::Tensor forward(const infinicore::Tensor &hidden_states,
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
/**
* @brief Get the layer index
*/
size_t layer_idx() const { return layer_idx_; }
/**
* @brief Provide shared RoPE module from parent model.
*/
......@@ -68,15 +75,15 @@ protected:
std::shared_ptr<infinicore::nn::RoPE> rotary_emb_;
private:
size_t layer_idx_; // Layer index for cache access
size_t hidden_size_;
size_t num_attention_heads_;
size_t num_key_value_heads_;
size_t head_dim_;
size_t kv_dim_;
bool use_bias_;
// Internal KV cache for when no external cache is provided
mutable infinilm::cache::KVCache internal_cache_;
bool use_bias_; // Bias for Q/K/V projections
bool use_output_bias_; // Bias for output projection (o_proj)
size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility)
};
} // namespace infinilm::models::llama
......@@ -3,6 +3,7 @@
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
namespace infinilm::models::llama {
......@@ -37,14 +38,23 @@ struct LlamaConfig {
// Optional features
bool use_cache = true; // Whether to use KV cache
bool attention_bias = false; // Whether to use bias in attention projections
bool attention_bias = true; // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
bool mlp_bias = false; // Whether to use bias in MLP projections
bool tie_word_embeddings = false; // Whether to tie input/output embeddings
// Training/initialization parameters
double attention_dropout = 0.0; // Dropout ratio for attention probabilities
double initializer_range = 0.02; // Standard deviation for weight initialization
size_t pretraining_tp = 1; // Tensor parallelism rank used during pretraining
// Model metadata
std::string name_or_path = ""; // Model name or path identifier
// Token IDs
int64_t pad_token_id = -1; // Padding token ID (optional)
int64_t bos_token_id = 1; // Beginning of sequence token ID
int64_t eos_token_id = 2; // End of sequence token ID
std::vector<int64_t> bos_token_id = {1}; // Beginning of sequence token ID(s)
std::vector<int64_t> eos_token_id = {2}; // End of sequence token ID(s)
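// Hedged helper sketch (illustrative, would need <algorithm>): with eos_token_id
// now a vector, stop conditions should test membership rather than compare
// against a single id, e.g.
//   bool is_eos = std::find(eos_token_id.begin(), eos_token_id.end(), token)
//                 != eos_token_id.end();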
/**
* @brief Compute key-value dimension for Grouped Query Attention (GQA)
......
......@@ -5,7 +5,9 @@
namespace infinilm::models::llama {
LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
size_t layer_idx,
infinicore::DataType dtype)
: layer_idx_(layer_idx) {
// Initialize layer normalization layers
INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps,
dtype, device);
......@@ -13,7 +15,7 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore
dtype, device);
// Initialize attention and MLP modules
INFINICORE_NN_MODULE_INIT(self_attn, config, device, dtype);
INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, dtype);
INFINICORE_NN_MODULE_INIT(mlp, config, device, dtype);
}
......
......@@ -28,9 +28,11 @@ public:
*
* @param config Model configuration
* @param device Device to create tensors on
* @param layer_idx Layer index for cache management and debugging
* @param dtype Optional data type for model parameters (defaults to F32)
*/
LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device,
size_t layer_idx,
infinicore::DataType dtype = infinicore::DataType::F32);
/**
......@@ -45,6 +47,11 @@ public:
const infinicore::Tensor &position_ids,
void *kv_cache = nullptr) const;
/**
* @brief Get the layer index
*/
size_t layer_idx() const { return layer_idx_; }
void set_rotary_emb(const std::shared_ptr<infinicore::nn::RoPE> &rotary_emb) {
if (self_attn_) {
self_attn_->set_rotary_emb(rotary_emb);
......@@ -60,6 +67,9 @@ protected:
// Attention and MLP
INFINICORE_NN_MODULE(LlamaAttention, self_attn);
INFINICORE_NN_MODULE(LlamaMLP, mlp);
private:
size_t layer_idx_; // Layer index for cache management and debugging
};
} // namespace infinilm::models::llama
#include "llama_for_causal_lm.hpp"
#include "infinicore/nn/linear.hpp"
#include "infinicore/ops.hpp"
#include "infinicore/context/context.hpp"
#include <iostream>
namespace infinilm::models::llama {
LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device,
infinicore::DataType dtype) {
// Initialize module's device_ member
device_ = device;
// Initialize base model
INFINICORE_NN_MODULE_INIT(model, config, device, dtype);
......@@ -20,14 +24,21 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::
infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
void *kv_cache) const {
// 1. Forward through base model to get hidden states
auto position_ids_device = position_ids->to(device_);
auto hidden_states = model_->forward(input_ids, position_ids_device, kv_caches);
auto hidden_states = model_->forward(input_ids, position_ids_device, kv_cache);
// 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states);
// 3. CRITICAL: Synchronize the C++ backend's context after forward pass
// This ensures all C++ backend operations complete before returning to Python
if (device_.getType() != infinicore::Device::Type::CPU) {
infinicore::context::setDevice(device_, false);
infinicore::context::syncStream();
}
return logits;
}
......@@ -49,4 +60,8 @@ infinicore::Tensor LlamaForCausalLM::forward(std::vector<std::any> args) const {
return forward(input_ids, position_ids, kv_caches);
}
void LlamaForCausalLM::reset_cache(size_t pos) {
model_->reset_cache(pos);
}
} // namespace infinilm::models::llama
......@@ -35,18 +35,18 @@ public:
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Logits tensor of shape [batch, seq_len, vocab_size]
*
* Note: This is a placeholder forward method. The actual implementation
* will be added when integrating with the inference engine.
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
void *kv_cache = nullptr) const;
infinicore::Tensor forward(std::vector<std::any> args) const override;
// Reset internal cache position
void reset_cache(size_t pos = 0) override;
// Module information
const LlamaConfig &config() const { return model_->config(); }
LlamaModel &model() { return *model_; }
......
......@@ -13,9 +13,15 @@ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &devi
INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size,
std::nullopt, dtype, device);
// Initialize decoder layers
INFINICORE_NN_MODULE_VEC_INIT(layers, config.num_hidden_layers, LlamaDecoderLayer,
config, device, dtype);
// Initialize decoder layers with layer indices
// TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments
// (e.g., via a factory function or lambda that receives the layer index)
// Currently, we can't use the macro because each layer needs a different layer_idx
layers_.reserve(config.num_hidden_layers);
for (size_t i = 0; i < config.num_hidden_layers; ++i) {
layers_.push_back(this->register_module<LlamaDecoderLayer>(
"layers." + std::to_string(i), config, device, i, dtype));
}
// Initialize final layer normalization
INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps,
......@@ -36,27 +42,59 @@ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &devi
infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches) const {
void *kv_cache) const {
// Use persistent internal cache if no external cache is provided
// This matches Python backend behavior: if use_cache and past_key_values is None, create DynamicCache
// The cache persists across forward calls to enable incremental decoding
void *cache_to_use = kv_cache;
if (kv_cache == nullptr) {
// Create or reuse persistent internal cache at model level
// This ensures the cache persists across multiple forward calls (prefill -> decode -> decode...)
size_t seq_len = input_ids->shape()[1];
if (!cache_) {
// First time: create cache
cache_ = std::make_unique<infinilm::cache::DynamicCache>(
config_.num_hidden_layers,
config_.max_position_embeddings
);
}
cache_to_use = cache_.get();
}
// 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
auto hidden_states = embed_tokens_->forward(input_ids);
// 2. Process through all decoder layers
for (size_t i = 0; i < layers_.size(); ++i) {
void *kv_cache = (kv_caches && i < kv_caches->size()) ? (*kv_caches)[i] : nullptr;
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, kv_cache);
size_t num_layers = layers_.size();
for (size_t i = 0; i < num_layers; ++i) {
// Pass model-level cache (layer index is now a property of the layer)
hidden_states = layers_.at(i)->forward(hidden_states, position_ids, cache_to_use);
// DEBUG: Disabled previous final layer logging
// Logging moved to decoder layer for post-attention normalization
}
// 3. Apply final layer normalization to last token only (aligns with transformers)
// Narrow to last token: [batch, seq_len, hidden_size] -> [batch, 1, hidden_size]
auto shape = hidden_states->shape();
size_t seq_len = shape[1];
auto last_token = hidden_states; //->narrow({{1, seq_len - 1, 1}});
auto last_token = hidden_states->narrow({{1, seq_len - 1, 1}});
auto normalized_states = norm_->forward(hidden_states);
auto normalized_last_token = normalized_states->narrow({{1, seq_len - 1, 1}});
// DEBUG: Disabled previous final layer normalization logging
// Normalize only the last token (matches Python backend)
auto normalized_last_token = norm_->forward(last_token);
return normalized_last_token;
}
void LlamaModel::reset_cache(size_t pos) const {
if (cache_) {
cache_->reset(pos);
}
}
} // namespace infinilm::models::llama
......@@ -2,6 +2,7 @@
#include "llama_config.hpp"
#include "llama_decoder_layer.hpp"
#include "cache/kv_cache.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"
......@@ -9,6 +10,7 @@
#include "infinicore/tensor.hpp"
#include "infinicore/device.hpp"
#include <vector>
#include <memory>
namespace infinilm::models::llama {
......@@ -40,21 +42,27 @@ public:
*
* @param input_ids Token IDs tensor of shape [batch, seq_len]
* @param position_ids Position IDs tensor of shape [batch, seq_len] or [seq_len]
* @param kv_caches Optional KV caches for incremental decoding (one per layer)
* @param kv_cache Optional model-level KV cache for incremental decoding
* @return Output tensor of shape [batch, seq_len, hidden_size]
*
* Note: This is a placeholder forward method. The actual implementation
* will be added when integrating with the inference engine.
*/
infinicore::Tensor forward(const infinicore::Tensor &input_ids,
const infinicore::Tensor &position_ids,
std::vector<void *> *kv_caches = nullptr) const;
void *kv_cache = nullptr) const;
// Module information
const LlamaConfig &config() const { return config_; }
size_t num_layers() const { return config_.num_hidden_layers; }
/**
* @brief Reset the internal cache to a specific position
* This should be called when starting a new generation sequence to prevent state
* from persisting between different questions/prompts
* @param pos Position to reset to (defaults to 0)
*/
void reset_cache(size_t pos = 0) const;
protected:
// Token embeddings
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
......@@ -70,6 +78,10 @@ protected:
private:
LlamaConfig config_;
// Persistent cache for when no external cache is provided
// Mutable because it's not part of the model's learned parameters,
// but needs to persist across forward calls for incremental decoding
mutable std::unique_ptr<infinilm::cache::DynamicCache> cache_;
};
} // namespace infinilm::models::llama
......@@ -51,7 +51,11 @@ inline void bind_infer_engine(py::module &m) {
}
return result;
})
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments");
.def("generate", [](InferEngine &self, py::object input_ids, py::object position_ids) -> infinicore::Tensor { return self.generate(input_ids.cast<infinicore::Tensor>(), position_ids.cast<infinicore::Tensor>()); }, "Run inference on all ranks with arbitrary arguments")
.def("reset_cache", &InferEngine::reset_cache,
py::arg("pos") = 0, py::arg("async") = false,
"Reset the internal cache in all workers to a specific position (clears state between generations). "
"By default, this is synchronous. If async=True, this becomes asynchronous (unstable - use with caution).");
// Optionally, you can add __repr__ for debugging
m.attr("InferEngine").attr("__repr__") = py::cpp_function([](const InferEngine &self) {
......
......@@ -56,105 +56,80 @@ inline void bind_llama(py::module &m) {
.def_readwrite("model_type", &LlamaConfig::model_type)
.def_readwrite("rope_theta", &LlamaConfig::rope_theta)
.def_readwrite("attention_bias", &LlamaConfig::attention_bias)
.def_readwrite("attention_output_bias", &LlamaConfig::attention_output_bias)
.def_readwrite("mlp_bias", &LlamaConfig::mlp_bias)
.def_readwrite("tie_word_embeddings", &LlamaConfig::tie_word_embeddings)
.def_readwrite("use_cache", &LlamaConfig::use_cache)
.def_readwrite("attention_dropout", &LlamaConfig::attention_dropout)
.def_readwrite("initializer_range", &LlamaConfig::initializer_range)
.def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
.def_readwrite("name_or_path", &LlamaConfig::name_or_path)
.def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
.def_readwrite("bos_token_id", &LlamaConfig::bos_token_id)
.def_readwrite("eos_token_id", &LlamaConfig::eos_token_id)
.def("validate", &LlamaConfig::validate)
.def("kv_dim", &LlamaConfig::kv_dim);
// Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
// Helper function to convert Python object (InfiniCore tensor, numpy array, or torch tensor) to C++ Tensor
auto convert_to_tensor = [](py::object obj, const Device &device) -> infinicore::Tensor {
// First check if it's already an InfiniCore tensor (has _underlying attribute)
if (py::hasattr(obj, "_underlying")) {
try {
// Extract the underlying C++ tensor from Python InfiniCore tensor
auto underlying = obj.attr("_underlying");
auto infini_tensor = underlying.cast<infinicore::Tensor>();
return infini_tensor;
} catch (const py::cast_error &) {
// Fall through to other conversion methods
}
}
// Try direct cast (in case it's already a C++ tensor exposed to Python)
try {
auto infini_tensor = obj.cast<infinicore::Tensor>();
return infini_tensor;
} catch (const py::cast_error &) {
// Not an InfiniCore tensor, continue with other conversions
}
// Try to get data pointer and shape from numpy array or torch tensor
void *data_ptr = nullptr;
std::vector<size_t> shape;
infinicore::DataType dtype = infinicore::DataType::F32;
// Check if it's a numpy array
if (py::hasattr(obj, "__array_interface__")) {
auto array_info = obj.attr("__array_interface__");
auto data = array_info["data"];
if (py::isinstance<py::tuple>(data)) {
auto data_tuple = data.cast<py::tuple>();
data_ptr = reinterpret_cast<void *>(data_tuple[0].cast<uintptr_t>());
} else {
data_ptr = reinterpret_cast<void *>(data.cast<uintptr_t>());
}
auto shape_obj = array_info["shape"];
if (py::isinstance<py::tuple>(shape_obj)) {
auto shape_tuple = shape_obj.cast<py::tuple>();
for (auto dim : shape_tuple) {
shape.push_back(dim.cast<size_t>());
.def_property("bos_token_id",
[](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.bos_token_id);
},
[](LlamaConfig &self, py::object value) {
// Accept both single int and list
if (py::isinstance<py::int_>(value)) {
self.bos_token_id = {value.cast<int64_t>()};
} else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
self.bos_token_id = value.cast<std::vector<int64_t>>();
} else {
throw py::type_error("bos_token_id must be int or list of ints");
}
} else {
shape.push_back(shape_obj.cast<size_t>());
}
// Get dtype
std::string typestr = array_info["typestr"].cast<std::string>();
if (typestr == "<f4" || typestr == "float32") {
dtype = infinicore::DataType::F32;
} else if (typestr == "<f2" || typestr == "float16") {
dtype = infinicore::DataType::F16;
} else if (typestr == "<i4" || typestr == "int32") {
dtype = infinicore::DataType::I32;
} else if (typestr == "<i8" || typestr == "int64") {
dtype = infinicore::DataType::I64;
}
} else if (py::hasattr(obj, "data_ptr")) {
// Try torch tensor
data_ptr = reinterpret_cast<void *>(obj.attr("data_ptr")().cast<uintptr_t>());
auto shape_obj = obj.attr("shape");
if (py::isinstance<py::tuple>(shape_obj) || py::isinstance<py::list>(shape_obj)) {
for (auto dim : shape_obj) {
shape.push_back(dim.cast<size_t>());
})
.def_property("eos_token_id",
[](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.eos_token_id);
},
[](LlamaConfig &self, py::object value) {
// Accept both single int and list
if (py::isinstance<py::int_>(value)) {
self.eos_token_id = {value.cast<int64_t>()};
} else if (py::isinstance<py::list>(value) || py::isinstance<py::tuple>(value)) {
self.eos_token_id = value.cast<std::vector<int64_t>>();
} else {
throw py::type_error("eos_token_id must be int or list of ints");
}
} else {
shape.push_back(shape_obj.cast<size_t>());
}
// Get dtype from torch tensor
std::string dtype_str = py::str(obj.attr("dtype"));
if (dtype_str.find("float32") != std::string::npos) {
dtype = infinicore::DataType::F32;
} else if (dtype_str.find("float16") != std::string::npos) {
dtype = infinicore::DataType::F16;
} else if (dtype_str.find("int32") != std::string::npos) {
dtype = infinicore::DataType::I32;
} else if (dtype_str.find("int64") != std::string::npos) {
dtype = infinicore::DataType::I64;
}
} else {
throw std::runtime_error("Unsupported tensor type. Expected InfiniCore tensor, numpy array, or torch tensor.");
}
})
.def("validate", &LlamaConfig::validate)
.def("kv_dim", &LlamaConfig::kv_dim)
// Add __dir__ to make attributes discoverable via dir() in Python
.def("__dir__", [](const LlamaConfig &self) {
py::list dir_list;
dir_list.append("vocab_size");
dir_list.append("hidden_size");
dir_list.append("intermediate_size");
dir_list.append("num_hidden_layers");
dir_list.append("num_attention_heads");
dir_list.append("num_key_value_heads");
dir_list.append("head_dim");
dir_list.append("max_position_embeddings");
dir_list.append("rms_norm_eps");
dir_list.append("hidden_act");
dir_list.append("model_type");
dir_list.append("rope_theta");
dir_list.append("attention_bias");
dir_list.append("attention_output_bias");
dir_list.append("mlp_bias");
dir_list.append("tie_word_embeddings");
dir_list.append("use_cache");
dir_list.append("attention_dropout");
dir_list.append("initializer_range");
dir_list.append("pretraining_tp");
dir_list.append("name_or_path");
dir_list.append("pad_token_id");
dir_list.append("bos_token_id");
dir_list.append("eos_token_id");
dir_list.append("validate");
dir_list.append("kv_dim");
return dir_list;
});
return infinicore::Tensor::from_blob(data_ptr, shape, dtype, device);
};
// Note: Device is already bound in InfiniCore bindings, so we don't need to bind it here
// Bind LlamaForCausalLM
py::class_<LlamaForCausalLM, std::shared_ptr<LlamaForCausalLM>>(m, "LlamaForCausalLM")
......@@ -190,55 +165,62 @@ inline void bind_llama(py::module &m) {
return tensor;
}
throw std::runtime_error("Parameter '" + name + "' not found in model"); }, py::arg("name"))
.def("load_state_dict", [convert_to_tensor](LlamaForCausalLM &model, py::dict state_dict, const Device &device) {
.def("load_state_dict", [](LlamaForCausalLM &model, py::dict state_dict) {
// Convert Python dict to C++ state_dict
std::unordered_map<std::string, infinicore::Tensor> cpp_state_dict;
for (auto item : state_dict) {
std::string key = item.first.cast<std::string>();
py::object value = item.second.cast<py::object>();
cpp_state_dict.emplace(key, convert_to_tensor(value, device));
// Extract InfiniCore tensor from Python object
infinicore::Tensor tensor;
if (py::hasattr(value, "_underlying")) {
tensor = value.attr("_underlying").cast<infinicore::Tensor>();
} else {
tensor = value.cast<infinicore::Tensor>();
}
cpp_state_dict.emplace(key, tensor);
}
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"), py::arg("device"))
model.load_state_dict(cpp_state_dict); }, py::arg("state_dict"))
.def("config", &LlamaForCausalLM::config, py::return_value_policy::reference_internal)
.def("forward", [convert_to_tensor](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_caches = py::none()) {
// Helper to extract C++ tensor from Python object
auto get_tensor = [convert_to_tensor](py::object obj) -> infinicore::Tensor {
.def(
"reset_cache", [](const LlamaForCausalLM &model, size_t pos = 0) {
// Reset the internal cache to prevent state from persisting between generations
model.model().reset_cache(pos);
}, py::arg("pos") = 0, "Reset the internal cache to a specific position (clears state between generations)")
.def("forward", [](const LlamaForCausalLM &model, py::object input_ids, py::object position_ids, py::object kv_cache = py::none()) {
// Helper to extract C++ tensor from Python InfiniCore tensor
auto get_tensor = [](py::object obj) -> infinicore::Tensor {
// If it's already a Python InfiniCore tensor wrapper, extract underlying
if (py::hasattr(obj, "_underlying")) {
return obj.attr("_underlying").cast<infinicore::Tensor>();
}
// Try direct cast (in case it's already a C++ tensor)
try {
return obj.cast<infinicore::Tensor>();
} catch (const py::cast_error &) {
// Extract device from first tensor for conversion
Device device = Device(Device::Type::CPU, 0);
if (py::hasattr(obj, "device")) {
try {
auto py_device = obj.attr("device");
if (py::hasattr(py_device, "_underlying")) {
device = py_device.attr("_underlying").cast<Device>();
} else {
device = py_device.cast<Device>();
}
} catch (...) {
// Keep default CPU device
}
}
return convert_to_tensor(obj, device);
}
return obj.cast<infinicore::Tensor>();
};
// Convert Python tensors to C++ tensors
// Extract InfiniCore tensors from Python objects
auto infini_input_ids = get_tensor(input_ids);
auto infini_position_ids = get_tensor(position_ids);
// Handle kv_caches if provided
std::vector<void *> *kv_caches_ptr = nullptr;
// Handle kv_cache if provided (model-level DynamicCache)
void *kv_cache_ptr = nullptr;
if (!kv_cache.is_none()) {
// Try to extract DynamicCache from Python object
if (py::hasattr(kv_cache, "_underlying")) {
kv_cache_ptr = kv_cache.attr("_underlying").cast<void *>();
} else {
// Try direct cast
try {
kv_cache_ptr = kv_cache.cast<void *>();
} catch (...) {
// If conversion fails, pass nullptr (cache will be ignored)
kv_cache_ptr = nullptr;
}
}
}
return model.forward(infini_input_ids, infini_position_ids, kv_caches_ptr); },
//
py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
return model.forward(infini_input_ids, infini_position_ids, kv_cache_ptr);
}, py::arg("input_ids"), py::arg("position_ids"), py::arg("kv_caches") = py::none());
}
} // namespace infinilm::models::llama
......@@ -119,6 +119,19 @@ class GenerationMixin:
):
model_kwargs = kwargs
# -------------------------------------------------------------------- #
# CRITICAL: Reset internal cache before each new generation
# This prevents state from persisting between different questions/prompts
# -------------------------------------------------------------------- #
# Check if this is a cpp backend model (has _model attribute with reset_cache method)
if hasattr(self, '_model') and hasattr(self._model, 'reset_cache'):
try:
self._model.reset_cache()
except Exception as e:
# If reset_cache fails, log but continue (shouldn't happen)
import warnings
warnings.warn(f"Failed to reset cache: {e}")
# -------------------------------------------------------------------- #
# Create the cache #
# -------------------------------------------------------------------- #
......@@ -166,6 +179,12 @@ class GenerationMixin:
[eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
)
# Extract sampling parameters from kwargs with defaults
random_val = model_kwargs.get("random_val", 0.1)
topp = model_kwargs.get("topp", 0.8)
topk = model_kwargs.get("topk", 1)
temperature = model_kwargs.get("temperature", 1.0)
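# Hedged usage sketch (caller side; keyword names assumed from the defaults above):
# sampling can now be tuned per generate() call, e.g.
#   model.generate(input_ids, topk=50, topp=0.9, temperature=0.7, random_val=0.05)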
# -------------------------------------------------------------------------- #
# Initialize position_ids
# -------------------------------------------------------------------------- #
......@@ -189,6 +208,7 @@ class GenerationMixin:
# Run the forward pass once (prefill)
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self(**model_inputs)
# -------------------------------------------------------------------------- #
......@@ -206,15 +226,16 @@ class GenerationMixin:
dtype=infinicore.int32,
device=token_scores.device,
)
for i in range(0, batch_size):
score = token_scores.narrow(0, i, 1).view((vocab_size,))
out = next_tokens.narrow(0, i, 1).view([])
infinicore.nn.functional.random_sample(
score,
0.8,
0.1,
1,
1.0,
random_val,
topp,
topk,
temperature,
out=out,
)
......@@ -236,7 +257,6 @@ class GenerationMixin:
print(output_str, end="", flush=True)
if stop_on_eos and token_id in eos_token_id_list:
break
print("\n</s>")
print(f"\n\n\n Generation completed in {round(sum(time_list), 2)} ms")
print(
......
......@@ -14,6 +14,9 @@ class LlamaConfig:
This class wraps configuration_llama.LlamaConfig and provides
a _underlying property that creates the C++ config object.
Automatically detects and handles both regular Llama models and Jiuge models
(fm9g7b, fm9g, minicpm) with appropriate defaults and validation.
"""
def __init__(self, config_dict=None, **kwargs):
......@@ -64,22 +67,61 @@ class LlamaConfig:
except (AttributeError, TypeError):
pass
# Handle defaults
if (
not hasattr(self._cpp_config, "num_key_value_heads")
or self._cpp_config.num_key_value_heads == 0
):
# Handle num_key_value_heads with validation
python_num_kv_heads = getattr(self._python_config, "num_key_value_heads", None)
if python_num_kv_heads is None or python_num_kv_heads == 0:
self._cpp_config.num_key_value_heads = (
self._cpp_config.num_attention_heads
)
if (
not hasattr(self._cpp_config, "head_dim")
or self._cpp_config.head_dim == 0
):
self._cpp_config.head_dim = (
self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
)
else:
self._cpp_config.num_key_value_heads = python_num_kv_heads
# Handle head_dim with validation (critical for GEMM operations)
python_head_dim = getattr(self._python_config, "head_dim", None)
if python_head_dim is None or python_head_dim == 0:
# Compute from hidden_size and num_attention_heads
if self._cpp_config.hidden_size > 0 and self._cpp_config.num_attention_heads > 0:
computed_head_dim = self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
self._cpp_config.head_dim = computed_head_dim
else:
raise ValueError(
f"Cannot compute head_dim: hidden_size={self._cpp_config.hidden_size}, "
f"num_attention_heads={self._cpp_config.num_attention_heads}"
)
else:
# Use from Python config
self._cpp_config.head_dim = python_head_dim
# Validate it matches expected value (warn but allow for flexibility)
if self._cpp_config.hidden_size > 0 and self._cpp_config.num_attention_heads > 0:
expected_head_dim = self._cpp_config.hidden_size // self._cpp_config.num_attention_heads
if self._cpp_config.head_dim != expected_head_dim:
import warnings
warnings.warn(
f"head_dim ({self._cpp_config.head_dim}) != hidden_size/num_attention_heads ({expected_head_dim}). "
f"Using head_dim from config."
)
# Ensure vocab_size is set (explicit handling)
if hasattr(self._python_config, "vocab_size"):
self._cpp_config.vocab_size = self._python_config.vocab_size
# Validate config after setting all values (especially important for jiuge models)
if not self._cpp_config.validate():
raise ValueError("C++ LlamaConfig validation failed. Check config values.")
# Log key config values for debugging (especially useful for jiuge models)
import logging
logger = logging.getLogger(__name__)
logger.info(
f"LlamaConfig ({self._python_config.model_type}) C++ LlamaConfig created: vocab_size={self._cpp_config.vocab_size}, "
f"hidden_size={self._cpp_config.hidden_size}, "
f"num_attention_heads={self._cpp_config.num_attention_heads}, "
f"num_key_value_heads={self._cpp_config.num_key_value_heads}, "
f"head_dim={self._cpp_config.head_dim}, "
f"kv_dim={self._cpp_config.kv_dim()}, "
f"attention_bias={self._cpp_config.attention_bias}, "
f"attention_output_bias={self._cpp_config.attention_output_bias}"
)
return self._cpp_config
......@@ -104,13 +146,23 @@ class LlamaForCausalLM(GenerationMixin):
"""
super().__init__()
self.config = config
# Convert config to LlamaConfig (handles both regular Llama and Jiuge models)
if isinstance(config, dict):
config = LlamaConfig(**config)
elif not isinstance(config, LlamaConfig):
# Not a dict or LlamaConfig, try to convert
config = LlamaConfig(config)
# If already LlamaConfig, use as-is (it will auto-detect jiuge models)
if device is None:
device = infinicore.device()
self.use_cache = False
# Store the Python wrapper config so it can be accessed later
# This is needed for DynamicCache which calls config.get_text_config()
self._config = config
self._device = device
# self._model = _infinilm.LlamaForCausalLM(
# config._underlying, device._underlying, dtype
......@@ -149,10 +201,13 @@ class LlamaForCausalLM(GenerationMixin):
"""
return self._model.get_parameter(name)
# @property
# def config(self):
# """Get model configuration"""
# return self._model.config()
@property
def config(self):
"""Get model configuration"""
# Return the Python wrapper config instead of C++ config
# This ensures compatibility with code that expects PretrainedConfig methods
# like get_text_config() used by DynamicCache
return self._config
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
......@@ -195,5 +250,6 @@ class LlamaForCausalLM(GenerationMixin):
with open(config_path, "r") as f:
config_dict = json.load(f)
# LlamaConfig automatically detects and handles jiuge models
config = LlamaConfig(config_dict)
return cls(config, device=device, dtype=dtype, **kwargs)
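# Hedged end-to-end sketch (paths and keyword arguments are illustrative only):
#   model = LlamaForCausalLM.from_pretrained("/path/to/model", device=infinicore.device())
#   model.generate(input_ids, topk=1, temperature=1.0)
#   # generate() resets the KV cache automatically; it can also be rewound manually:
#   model._model.reset_cache(0)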