Unverified commit ab1767cf, authored by Li Zhang, committed by GitHub
Browse files

TurboMind 2 (#590)

* refresh decoder attention kernel

* block-level kv cache

* `BlockManager` & `SequenceManager`

* update

* update

* update

* update

* rename

* GQA support

* fix context length

* GQA dispatch

* kv8

* tune

* async stream cb

* nvtx

* config parsing

* debug

* optimize output cost

* split-k decoding

* minor

* truncate `session_len` by available blocks

* minor

* license

* fix

* dispatch `cp.async`

* fix linking

* fix

* fix deadlock

* guard input length

* correct start offset

* fix prefill chunking

* fix `cache_block_seq_len` param passing

* fix `block_size` fmtstr

* fix output tokens

* fix batch resizing

* fix masking of finished sequences

* add debug util

* free unused block early

* add ntk scaling and logn scaling

* cmake flags

* fix typo

* w4a16 for sm75

* fix msvc build

* fix msvc build

* fix block verification

* fix msvc build

* use `std::shuffle`

* fix lint

* fix lint

* fix lint

* clear incoming buffer

* clear finished requests

* fix batch initialization

* fix typo

* fix typo

* fix comparison
parent 06125966
This diff is collapsed.
...@@ -14,6 +14,8 @@ namespace turbomind { ...@@ -14,6 +14,8 @@ namespace turbomind {
struct Request { struct Request {
uint64_t id; uint64_t id;
uint64_t priority;
bool start_flag; bool start_flag;
bool end_flag; bool end_flag;
bool stop_flag; bool stop_flag;
...@@ -31,7 +33,8 @@ struct Request { ...@@ -31,7 +33,8 @@ struct Request {
kConflict = 2, kConflict = 2,
kBusy = 3, kBusy = 3,
kInactive = 4, kInactive = 4,
kFail = 5 kFail = 5,
kTooLong = 6
}; };
std::promise<int> signal; std::promise<int> signal;
}; };
...@@ -66,11 +69,16 @@ public: ...@@ -66,11 +69,16 @@ public:
void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests, void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests,
std::vector<std::shared_ptr<Request>>& infer_requests, std::vector<std::shared_ptr<Request>>& infer_requests,
unsigned max_infer_count, unsigned max_infer_count,
bool blocking) bool blocking,
bool& abort)
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
if (blocking) { if (blocking) {
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty() && closed_ == false); }); cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; });
if (closed_) {
abort = true;
return;
}
} }
stop_requests.clear(); stop_requests.clear();
...@@ -87,9 +95,11 @@ public: ...@@ -87,9 +95,11 @@ public:
} }
void close() void close()
{
{ {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
closed_ = true; closed_ = true;
}
cv_.notify_all(); cv_.notify_all();
} }
...@@ -98,7 +108,7 @@ private: ...@@ -98,7 +108,7 @@ private:
std::queue<std::shared_ptr<Request>> infer_queue_; std::queue<std::shared_ptr<Request>> infer_queue_;
std::mutex mutex_; std::mutex mutex_;
std::condition_variable cv_; std::condition_variable cv_;
bool closed_ = false; bool closed_{false};
}; };
} // namespace turbomind } // namespace turbomind
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment