OpenDAS / ollama · Commits

Commit cff3f44f, authored Jul 01, 2024 by Daniel Hiltgen

Fix case for NumCtx

parent 3518aaef
Showing 1 changed file with 7 additions and 7 deletions.

server/sched.go (+7, -7)
@@ -23,7 +23,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
-	origNumCTX      int // Track the initial ctx request
+	origNumCtx      int // Track the initial ctx request
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
@@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
-			if pending.origNumCTX == 0 {
-				pending.origNumCTX = pending.opts.NumCtx
+			if pending.origNumCtx == 0 {
+				pending.origNumCtx = pending.opts.NumCtx
 			}

 			if pending.ctx.Err() != nil {
@@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}
 			// Keep NumCtx and numParallel in sync
 			if numParallel > 1 {
-				pending.opts.NumCtx = pending.origNumCTX * numParallel
+				pending.opts.NumCtx = pending.origNumCtx * numParallel
 			}
 			for {
@@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					// simplifying assumption of defaultParallel when in CPU mode
 					if numParallel <= 0 {
 						numParallel = defaultParallel
-						pending.opts.NumCtx = pending.origNumCTX * numParallel
+						pending.opts.NumCtx = pending.origNumCtx * numParallel
 					}

 					if loadedCount == 0 {
@@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
-		req.opts.NumCtx = req.origNumCTX * p
+		req.opts.NumCtx = req.origNumCtx * p
 		if !envconfig.SchedSpread {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
@@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// Now try all the GPUs
 	for _, p := range numParallelToTry {
-		req.opts.NumCtx = req.origNumCTX * p
+		req.opts.NumCtx = req.origNumCtx * p
 		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
			slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
			*numParallel = p
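The commit is a pure rename: origNumCTX becomes origNumCtx so the field's casing matches the api.Options NumCtx setting it mirrors; no behavior changes. The reason the scheduler keeps this field at all is visible in the hunks above: opts.NumCtx is rewritten in place to origNumCtx * numParallel on every fit attempt, so each recomputation must start from the untouched original request or a retry with a different parallel count would compound the multiplier. A minimal sketch of that invariant, using simplified stand-in types rather than the actual scheduler code:

package main

import "fmt"

// options stands in for api.Options in this illustration only.
type options struct {
	NumCtx int
}

type request struct {
	opts       options
	origNumCtx int // the initial ctx request, captured once
}

// applyParallel recomputes the effective context window from the
// original request, never from the already-scaled opts.NumCtx.
func (r *request) applyParallel(numParallel int) {
	if r.origNumCtx == 0 {
		r.origNumCtx = r.opts.NumCtx // capture before the first rewrite
	}
	if numParallel > 1 {
		r.opts.NumCtx = r.origNumCtx * numParallel
	}
}

func main() {
	r := request{opts: options{NumCtx: 2048}}
	r.applyParallel(4)         // first attempt: 2048 * 4 = 8192
	r.applyParallel(2)         // retry with fewer slots: 2048 * 2
	fmt.Println(r.opts.NumCtx) // 4096, not 8192 * 2 = 16384
}

Running the sketch prints 4096: the second attempt is derived from the original 2048, not from the already-scaled 8192, which is exactly why origNumCtx exists as a separate field.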