// Copyright (c) OpenMMLab. All rights reserved. #pragma once #include "src/turbomind/models/llama/BlockManager.h" namespace turbomind { struct Sequence { enum Status { kCached = 0, kLocked, kActive }; uint64_t id; Status status; std::vector blocks; std::vector block_unique_ids; mutable std::vector tokens; // update by user mutable int cache_len; // additional data kept round-to-round mutable std::vector random_state; // update by user mutable float rope_theta; friend std::ostream& operator<<(std::ostream& os, const Sequence& seq); }; using Sequences = std::vector; inline std::ostream& operator<<(std::ostream& os, const Sequence& seq) { os << "id=" << seq.id << ", status=" << seq.status << ", token_count=" << seq.tokens.size() << ", block_count=" << seq.blocks.size() << ", cache_len=" << seq.cache_len << ", random_state_size=" << seq.random_state.size(); return os; } class SequenceManager { public: explicit SequenceManager(size_t layer_num, size_t head_num, size_t head_dim, size_t block_seq_len, double block_count, int chunk_size, size_t elem_bits, int rank, IAllocator* allocator); SequenceManager(const SequenceManager&) = delete; SequenceManager(SequenceManager&&) noexcept = default; const Sequence* Create(uint64_t id); const Sequence* Get(uint64_t id); bool Contains(uint64_t id); bool Erase(uint64_t id); void UpdateAndSetUnlock(const Sequence& seq); struct Outcome { int allocation; int swap_in; int swap_out; }; Outcome Materialize(Sequences sequences, std::vector context_lengths, const std::vector& priorities, int step_length); void* OffsetKey(void* block_ptr) { return block_ptr; } void* OffsetVal(void* block_ptr) { return (std::byte*)block_ptr + val_offset_; } int max_block_count() const noexcept { return block_manager_->max_block_count(); } private: void CommitUnlockAndFree(); void VerifyAndLockCached(const Sequences& sequences); std::vector CountRequiredBlocks(const Sequences& sequences, // const std::vector& context_lengths, int step_length); static void SortByPriority(Sequences& sequences, // std::vector& context_lengths, const std::vector& priorities); static void AssignAndActivate(const Sequences& sequences, // const std::vector& block_counts, const std::vector& blocks); private: int block_seq_len_; int rank_; size_t val_offset_{}; bool need_verify_{}; // Use `std::map` to avoid reference invalidation std::map sequences_; std::unique_ptr block_manager_; std::vector unlocked_; std::vector freed_; }; inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc) { os << "allocation: " << oc.allocation << ", swap-in: " << oc.swap_in << ", swap-out: " << oc.swap_out; return os; } } // namespace turbomind