Commit 942c9792 (unverified), authored by Jeffrey Morgan, committed by GitHub
Browse files

allocate a large enough kv cache for all parallel requests (#4162)

parent 06164911
...@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler { ...@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner // context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) { func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
// allocate a large enough kv cache for all parallel requests
opts.NumCtx = opts.NumCtx * numParallel
req := &LlmRequest{ req := &LlmRequest{
ctx: c, ctx: c,
model: model, model: model,
...@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, ...@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
successCh: make(chan *runnerRef), successCh: make(chan *runnerRef),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
// context split across parallel threads
opts.NumCtx = opts.NumCtx * numParallel
select { select {
case s.pendingReqCh <- req: case s.pendingReqCh <- req:
default: default:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment