#pragma once

#include "infinicore/context/context.hpp"
#include "infinicore/device.hpp"
#include "infinicore/tensor.hpp"

#include "cache_config.hpp"

#include <algorithm>
#include <cstddef>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>

#include <spdlog/spdlog.h>
namespace infinilm::cache {

/**
Ceng's avatar
Ceng committed
20
 * @brief Single layer's KV cache for incremental decoding
21
 *
22
 * Stores key and value caches with shape [batch_size, n_kv_head, capacity, head_dim]
23
24
 * Similar to DynamicLayer in Python cache_utils.py
 *
Ceng's avatar
Ceng committed
25
 * This represents a single layer's cache within a model-level cache container.
26
 */
Ceng's avatar
Ceng committed
27
struct KVCacheLayer {
28
29
30
31
    infinicore::Tensor k_cache;          // [batch_size, n_kv_head, capacity, head_dim]
    infinicore::Tensor v_cache;          // [batch_size, n_kv_head, capacity, head_dim]
    std::vector<size_t> cache_positions; // Current position in cache
    size_t max_capacity;                 // Maximum capacity of cache
32
33
34
    size_t initial_capacity;             // Initial capacity from config
    size_t initial_batch_size;           // Initial batch size from config
    float growth_factor;                 // Growth factor for dynamic resizing
35
36
    bool initialized;                    // Whether cache has been initialized

37
38
    KVCacheLayer() : max_capacity(0), initial_capacity(4096), initial_batch_size(1),
                     growth_factor(2.0f), initialized(false) {}
39
40

    /**
41
42
     * @brief Initialize or update cache capacity with config parameters
     * @param batch_size Current batch size
43
44
45
46
47
     * @param num_kv_heads Number of key-value heads
     * @param head_dim Head dimension
     * @param seq_len Sequence length of new tokens
     * @param dtype Data type
     * @param device Device
48
     * @param cache_config Cache configuration parameters
49
     */
50
    void ensure_capacity(size_t batch_size, size_t num_kv_heads, size_t head_dim, size_t seq_len,
Ceng's avatar
Ceng committed
51
                         infinicore::DataType dtype, const infinicore::Device &device,
52
                         const CacheConfig &cache_config) {
53
        size_t required_capacity = seq_len + std::accumulate(cache_positions.begin(), cache_positions.end(), 0, [](int a, int b) { return std::max(a, b); });
54

Ceng's avatar
Ceng committed
55
56
57
        // VALIDATION: Verify input parameters
        if (num_kv_heads == 0 || head_dim == 0 || seq_len == 0) {
            SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Invalid parameters - num_kv_heads: {}, head_dim: {}, seq_len: {}",
58
                         num_kv_heads, head_dim, seq_len);
Ceng's avatar
Ceng committed
59
60
61
            throw std::runtime_error("KV cache ensure_capacity: invalid parameters");
        }

62
63
64
65
66
67
68
        // Store config parameters on first initialization
        if (!initialized) {
            initial_capacity = cache_config.initial_capacity;
            initial_batch_size = cache_config.initial_batch_size;
            growth_factor = cache_config.growth_factor;
        }

69
70
        // Lazy initialization
        if (!initialized) {
71
72
73
74
75
76
77
            // Use max of required capacity and initial capacity from config
            max_capacity = std::max(required_capacity, initial_capacity);

            // Use max of current batch size and initial batch size from config
            size_t alloc_batch_size = std::max(batch_size, initial_batch_size);

            k_cache = infinicore::Tensor::empty({alloc_batch_size, num_kv_heads, max_capacity, head_dim},
78
                                                dtype, device);
79
            v_cache = infinicore::Tensor::empty({alloc_batch_size, num_kv_heads, max_capacity, head_dim},
80
                                                dtype, device);
81
            cache_positions = std::vector<size_t>(alloc_batch_size, 0);
82
            initialized = true;
Ceng's avatar
Ceng committed
83

84
85
86
            spdlog::debug("Initialized KV cache with batch_size={}, capacity={} (config: initial_batch={}, initial_capacity={})",
                          alloc_batch_size, max_capacity, initial_batch_size, initial_capacity);

Ceng's avatar
Ceng committed
87
            // VALIDATION: Verify cache was created correctly
88
89
            if (k_cache->shape()[0] != alloc_batch_size || k_cache->shape()[1] != num_kv_heads || k_cache->shape()[2] != max_capacity || k_cache->shape()[3] != head_dim) {
                SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Cache shape mismatch after initialization");
Ceng's avatar
Ceng committed
90
91
                throw std::runtime_error("KV cache initialization: shape mismatch");
            }
92
        }
93
        // Grow cache if needed using growth factor from config
94
        else if (required_capacity > max_capacity) {
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
            if (!cache_config.allow_expand) {
                SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Cache expansion not allowed by config");
                throw std::runtime_error("KV cache expansion not allowed");
            }
            // Calculate new capacity using growth factor
            size_t new_capacity = static_cast<size_t>(
                std::max(static_cast<float>(max_capacity) * growth_factor,
                         static_cast<float>(required_capacity + max_capacity)));

            // Ensure we don't exceed max_position_embeddings if specified
            if (cache_config.max_kv_cache_length != 0) {
                new_capacity = std::min(new_capacity, cache_config.max_kv_cache_length);
            }

            // Ensure we grow by at least some minimum amount
            size_t min_growth = 256;
            if (new_capacity - max_capacity < min_growth) {
                new_capacity = max_capacity + min_growth;
            }

115
116
117
118
119
120
121
            size_t new_batch_size = std::max(batch_size, k_cache->shape()[0]);
            if (num_kv_heads != k_cache->shape()[1] || head_dim != k_cache->shape()[3]) {
                throw std::runtime_error("KVCache ensure_capacity: num_kv_heads or head_dim mismatch with existing cache.");
            }
            if (new_batch_size > cache_positions.size()) {
                cache_positions.resize(new_batch_size, 0);
            }
122

123
            auto k_new = infinicore::Tensor::empty({new_batch_size, num_kv_heads, new_capacity, head_dim},
124
                                                   dtype, device);
125
            auto v_new = infinicore::Tensor::empty({new_batch_size, num_kv_heads, new_capacity, head_dim},
126
127
                                                   dtype, device);

128
129
130
            spdlog::debug("Growing KV cache from capacity {} to {} (growth_factor={})",
                          max_capacity, new_capacity, growth_factor);

131
            // Copy existing cache data
132
133
134
135
136
137
138
139
            for (size_t b = 0; b < new_batch_size; ++b) {
                size_t cache_position = cache_positions[b];
                if (cache_position > 0) {
                    auto k_slice = k_cache->narrow({{0, b, 1}, {2, 0, cache_position}});
                    auto v_slice = v_cache->narrow({{0, b, 1}, {2, 0, cache_position}});
                    k_new->narrow({{0, b, 1}, {2, 0, cache_position}})->copy_from(k_slice);
                    v_new->narrow({{0, b, 1}, {2, 0, cache_position}})->copy_from(v_slice);
                }
140
141
142
143
144
            }

            k_cache = k_new;
            v_cache = v_new;
            max_capacity = new_capacity;
Ceng's avatar
Ceng committed
145
146
147

            // VALIDATION: Verify cache was grown correctly
            if (k_cache->shape()[2] != new_capacity) {
148
                SPDLOG_ERROR("KVCacheLayer::ensure_capacity: New cache capacity mismatch");
Ceng's avatar
Ceng committed
149
150
151
152
153
154
                throw std::runtime_error("KV cache growth: capacity mismatch");
            }
        }

        // VALIDATION: Final check that capacity is sufficient
        if (required_capacity > max_capacity) {
155
            SPDLOG_ERROR("KVCacheLayer::ensure_capacity: Capacity still insufficient after growth");
Ceng's avatar
Ceng committed
156
            throw std::runtime_error("KV cache ensure_capacity: capacity insufficient");
157
158
159
160
161
        }
    }

    /**
     * @brief Update cache with new key and value states
162
163
     * @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
     * @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
164
     * @param cache_config Cache configuration for capacity management
Ceng's avatar
Ceng committed
165
     * @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
166
167
168
     */
    std::pair<infinicore::Tensor, infinicore::Tensor> update(
        const infinicore::Tensor &k_new,
169
170
        const infinicore::Tensor &v_new,
        const CacheConfig &cache_config) {
171
        if (k_new->ndim() != 4 || v_new->ndim() != 4) {
172
            throw std::runtime_error("KVCache update: k_new and v_new must be 4D tensors");
173
174
175
176
177
        }
        size_t batch_size = k_new->shape()[0];
        size_t num_kv_heads = k_new->shape()[1];
        size_t seq_len = k_new->shape()[2];
        size_t head_dim = k_new->shape()[3];
178

179
        // Ensure capacity with cache config
180
        ensure_capacity(batch_size, num_kv_heads, head_dim, seq_len,
181
                        k_new->dtype(), k_new->device(), cache_config);
182
183

        // Copy new k/v into cache at current position
184
185
186
        bool all_equal = cache_positions.empty() || std::equal(cache_positions.begin() + 1, cache_positions.end(), cache_positions.begin());
        if (all_equal) {
            auto cache_position = cache_positions[0];
187

188
189
190
191
            auto k_dst = k_cache->narrow({{2, cache_position, seq_len}});
            auto v_dst = v_cache->narrow({{2, cache_position, seq_len}});
            k_dst->copy_from(k_new);
            v_dst->copy_from(v_new);
192

193
194
195
196
197
            // Update position
            cache_position += seq_len;
            for (size_t b = 0; b < batch_size; ++b) {
                cache_positions[b] = cache_position;
            }
198

199
200
201
202
203
204
205
206
            // Return the total cache up to current position
            auto k_total = k_cache->narrow({{2, 0, cache_position}});
            auto v_total = v_cache->narrow({{2, 0, cache_position}});

            return std::make_pair(k_total, v_total);
        } else {
            throw std::runtime_error("KVCache update: cache positions must be equal among a batch.");
        }
207
208
209
    }
};

/**
 * @brief Model-level KV cache container (similar to DynamicCache in Python)
 *
 * Stores a list of KVCacheLayer objects, one per model layer.
 * This aligns with Python backend's DynamicCache architecture.
 */
class DynamicCache {
public:
218
219
220
221
222
223
224
225
226
227
228
    /**
     * @brief Construct DynamicCache with cache configuration
     * @param cache_config Cache configuration parameters
     */
    DynamicCache(const CacheConfig &cache_config)
        : cache_config_(cache_config), layers_(cache_config.num_layers) {
        if (cache_config.num_layers == -1) {
            throw std::runtime_error("DynamicCache: num_layers must be specified in CacheConfig");
        }
    }

Ceng's avatar
Ceng committed
229
230
231
232
233
234
235
    /**
     * @brief Construct DynamicCache with specified number of layers
     *
     * @param num_layers Number of model layers (creates one cache layer per model layer)
     * @param max_position_embeddings Maximum position embeddings (used for initial capacity)
     */
    DynamicCache(size_t num_layers, size_t max_position_embeddings = 4096)
236
        : cache_config_(CacheConfig(CacheType::DYNAMIC, num_layers, max_position_embeddings)), layers_(num_layers) {}
Ceng's avatar
Ceng committed
237
238
239
240
241
242
243
244
245
246

    /**
     * @brief Update cache with new key and value states for a specific layer
     */
    std::pair<infinicore::Tensor, infinicore::Tensor> update(
        size_t layer_idx,
        const infinicore::Tensor &k_new,
        const infinicore::Tensor &v_new) {
        if (layer_idx >= layers_.size()) {
            SPDLOG_ERROR("DynamicCache::update: layer_idx {} out of range (num_layers: {})",
247
                         layer_idx, layers_.size());
Ceng's avatar
Ceng committed
248
249
250
            throw std::runtime_error("DynamicCache: layer_idx out of range");
        }

251
252
        // Update the cache for this layer with cache config
        return layers_[layer_idx].update(k_new, v_new, cache_config_);
Ceng's avatar
Ceng committed
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    }

    /**
     * @brief Update cache with new key and value states (convenience method without layer_idx)
     * This is used when the cache is accessed directly without layer information
     *
     * @param k_new New key states [batch_size, n_kv_head, seq_len, head_dim]
     * @param v_new New value states [batch_size, n_kv_head, seq_len, head_dim]
     * @return Tuple of (k_total, v_total) with shape [batch_size, n_kv_head, total_seq_len, head_dim]
     *
     * Note: This assumes layer_idx=0. For multi-layer models, use update(layer_idx, k_new, v_new) instead.
     */
    std::pair<infinicore::Tensor, infinicore::Tensor> update(
        const infinicore::Tensor &k_new,
        const infinicore::Tensor &v_new) {
        return update(0, k_new, v_new);
    }

271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
    /**
     * @brief Get cache configuration
     */
    const CacheConfig &get_config() const { return cache_config_; }

    /**
     * @brief Update cache configuration (for dynamic reconfiguration)
     */
    void update_config(const CacheConfig &new_config) {
        // Check if we need to rebuild
        bool need_rebuild = false;

        // Rebuild if number of layers changed
        if (new_config.num_layers != cache_config_.num_layers || new_config.initial_batch_size != cache_config_.initial_batch_size) {
            need_rebuild = true;
            layers_.resize(new_config.num_layers);
        }

        // Rebuild if reset mode is RECREATE
        if (new_config.reset_mode == CacheResetMode::RECREATE) {
            need_rebuild = true;
        }

        // Update configuration
        cache_config_ = new_config;

        if (need_rebuild) {
            // Clear all layers to force reinitialization on next use
            for (auto &layer : layers_) {
                layer.initialized = false;
                layer.max_capacity = 0;
                // Tensors will be recreated when ensure_capacity is called
            }
            spdlog::info("DynamicCache configuration updated - cache will be rebuilt on next use");
        } else {
            spdlog::info("DynamicCache configuration updated: layers={}, initial_capacity={}, growth_factor={}",
                         new_config.num_layers, new_config.initial_capacity, new_config.growth_factor);
        }
    }

Ceng's avatar
Ceng committed
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
    /**
     * @brief Get the number of layers in this cache
     */
    size_t num_layers() const { return layers_.size(); }

    /**
     * @brief Get cache position for a specific layer
     */
    size_t cache_position(size_t layer_idx) const {
        if (layer_idx >= layers_.size()) {
            throw std::runtime_error("DynamicCache: layer_idx out of range");
        }
        if (layers_[layer_idx].cache_positions.empty()) {
            return 0;
        }
        return layers_[layer_idx].cache_positions[0]; // All batch items should have same position
    }

    /**
     * @brief Get max position embeddings (used for initial capacity)
     */
332
    size_t max_kv_cache_length() const { return cache_config_.max_kv_cache_length; }
Ceng's avatar
Ceng committed
333
334
335
336
337
338
339

    /**
     * @brief Reset cache for all layers to a specific position
     * This should be called when starting a new generation sequence or resetting to a specific position
     * @param pos Position to reset to (defaults to 0)
     */
    void reset(size_t pos = 0) {
340
        for (auto &layer : layers_) {
Ceng's avatar
Ceng committed
341
342
343
344
345
346
347
348
349
            std::fill(layer.cache_positions.begin(), layer.cache_positions.end(), pos);
            // Note: We don't reset initialized flag or clear the cache tensors
            // to avoid reallocation. The cache will be overwritten on next update.
        }
    }

    /**
     * @brief Access a specific layer's cache (for advanced usage)
     */
350
    KVCacheLayer &layer(size_t layer_idx) {
Ceng's avatar
Ceng committed
351
352
353
354
355
356
        if (layer_idx >= layers_.size()) {
            throw std::runtime_error("DynamicCache: layer_idx out of range");
        }
        return layers_[layer_idx];
    }

357
    const KVCacheLayer &layer(size_t layer_idx) const {
Ceng's avatar
Ceng committed
358
359
360
361
362
363
364
        if (layer_idx >= layers_.size()) {
            throw std::runtime_error("DynamicCache: layer_idx out of range");
        }
        return layers_[layer_idx];
    }

private:
365
    CacheConfig cache_config_;
Ceng's avatar
Ceng committed
366
367
368
    std::vector<KVCacheLayer> layers_;
};

} // namespace infinilm::cache