// Copyright (c) OpenMMLab. All rights reserved. #pragma once #include "src/turbomind/models/llama/BlockManager.h" #include namespace turbomind { struct Sequence { enum Status { kCached = 0, kLocked, kActive }; uint64_t id; Status status = kCached; BlockIds blocks; UniqueIds block_unique_ids; int input_length = 0; mutable std::vector tokens; // update by user mutable int cache_len = 0; // additional data kept round-to-round mutable std::vector random_state; // update by user mutable float rope_theta = 0.f; explicit Sequence(uint64_t _id): id(_id) {} friend std::ostream& operator<<(std::ostream& os, const Sequence& seq); }; using Sequences = std::vector; inline std::ostream& operator<<(std::ostream& os, const Sequence& seq) { os << "id=" << seq.id << ", status=" << seq.status << ", token_count=" << seq.tokens.size() << ", block_count=" << seq.blocks.size() << ", cache_len=" << seq.cache_len << ", random_state_size=" << seq.random_state.size(); return os; } class SequenceManager { public: explicit SequenceManager(size_t layer_num, size_t head_num, size_t head_dim, size_t block_seq_len, double block_count, int chunk_size, size_t elem_bits, int rank, IAllocator* allocator); SequenceManager(const SequenceManager&) = delete; SequenceManager(SequenceManager&&) noexcept = default; [[nodiscard]] const Sequence* Create(uint64_t id); [[nodiscard]] const Sequence* Get(uint64_t id); [[nodiscard]] bool Contains(uint64_t id); [[nodiscard]] bool Erase(uint64_t id); void UpdateAndSetUnlock(const Sequence& seq); struct Outcome { int allocation; int swap_in; int swap_out; }; using AdjustInputCount = std::function(const Sequences&, const std::vector&)>; [[nodiscard]] Outcome Materialize(Sequences sequences, std::vector context_lengths, const std::vector& priorities, int step_length, AdjustInputCount adjust); [[nodiscard]] void* GetKeyPtr(int block_id) { return block_manager_->block(block_id).data; } [[nodiscard]] void* GetValPtr(int block_id) { return (std::byte*)GetKeyPtr(block_id) + val_offset_; } int max_block_count() const noexcept { return block_manager_->max_block_count(); } private: void Erase(std::map::iterator it); void CommitUnlockAndFree(); void VerifyAndLockCached(const Sequences& sequences); std::vector CountRequiredBlocks(const Sequences& sequences, // const std::vector& context_lengths, int step_length); static void SortByPriority(Sequences& sequences, // std::vector& context_lengths, const std::vector& priorities); static void AssignAndActivate(const Sequences& sequences, // const std::vector& counts, const BlockIds& blocks, const UniqueIds& unique_ids); private: int block_seq_len_; int rank_; size_t val_offset_{}; // Use `std::map` to avoid reference invalidation std::map sequences_; std::unique_ptr block_manager_; BlockIds unlocked_; BlockIds freed_; }; inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc) { os << "allocation: " << oc.allocation << ", swap-in: " << oc.swap_in << ", swap-out: " << oc.swap_out; return os; } } // namespace turbomind