OpenDAS / ollama · commit e70610ef (unverified)

Authored Jul 01, 2024 by Daniel Hiltgen; committed via GitHub on Jul 01, 2024.

Merge pull request #5410 from dhiltgen/ctx_cleanup

Fix case for NumCtx

Parents: dfded7e0, 173b5504
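In short: the branch renames the scheduler's origNumCTX field to origNumCtx so its casing matches the opts.NumCtx option it mirrors, and drops the "(default auto)" suffix from two environment-variable help strings in envconfig.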
Showing 2 changed files with 9 additions and 9 deletions:

- envconfig/config.go (+2, -2)
- server/sched.go (+7, -7)
envconfig/config.go
```diff
@@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"},
+		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"},
+		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
```
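Each entry in this map pairs an environment variable's name with its current value and a one-line help string. A minimal sketch of how such a table can drive a help listing follows; the EnvVar definition and the sample values are assumptions reconstructed from the fields visible in the diff, not the upstream code:

```go
package main

import (
	"fmt"
	"sort"
)

// EnvVar mirrors the three fields visible in the diff: a name, a
// current value, and a one-line description. The upstream definition
// may differ; this is an illustrative stand-in.
type EnvVar struct {
	Name        string
	Value       any
	Description string
}

// asMap is a stand-in for envconfig.AsMap, seeded with the two
// entries this commit touches (values are made up for the demo).
func asMap() map[string]EnvVar {
	return map[string]EnvVar{
		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", 3, "Maximum number of loaded models per GPU"},
		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", 4, "Maximum number of parallel requests"},
	}
}

func main() {
	m := asMap()
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys) // maps iterate in random order; sort for stable help output
	for _, k := range keys {
		e := m[k]
		fmt.Printf("%-26s current: %v\n    %s\n", e.Name, e.Value, e.Description)
	}
}
```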
server/sched.go
```diff
@@ -23,7 +23,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
-	origNumCTX      int // Track the initial ctx request
+	origNumCtx      int // Track the initial ctx request
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
```
```diff
@@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
-			if pending.origNumCTX == 0 {
-				pending.origNumCTX = pending.opts.NumCtx
+			if pending.origNumCtx == 0 {
+				pending.origNumCtx = pending.opts.NumCtx
 			}
 			if pending.ctx.Err() != nil {
```
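This hunk also shows why the field exists at all: opts.NumCtx is later overwritten with a scaled value, so the scheduler records the caller's original request once, on the first scheduling attempt. A toy reproduction of that pattern (the names here are illustrative, not the upstream API):

```go
package main

import "fmt"

// The scheduler may retry a pending request with a different
// parallelism, so it must rescale from the caller's original context
// size rather than from the already-scaled effective value.
type request struct {
	numCtx     int // effective context handed to the runner
	origNumCtx int // context size the caller actually asked for
}

func schedule(r *request, numParallel int) {
	if r.origNumCtx == 0 {
		r.origNumCtx = r.numCtx // capture once, on the first attempt
	}
	r.numCtx = r.origNumCtx * numParallel
}

func main() {
	r := &request{numCtx: 2048}
	schedule(r, 4)
	fmt.Println(r.numCtx) // 8192
	schedule(r, 2)
	fmt.Println(r.numCtx) // 4096, not 16384: no compounding across attempts
}
```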
```diff
@@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}
 			// Keep NumCtx and numParallel in sync
 			if numParallel > 1 {
-				pending.opts.NumCtx = pending.origNumCTX * numParallel
+				pending.opts.NumCtx = pending.origNumCtx * numParallel
 			}
 			for {
```
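The multiplication itself reflects how the runner shares one context window across parallel sequences: loading with NumCtx = origNumCtx × numParallel (e.g. 2048 × 4 = 8192) presumably leaves each of the numParallel slots with the context size the caller asked for.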
```diff
@@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				// simplifying assumption of defaultParallel when in CPU mode
 				if numParallel <= 0 {
 					numParallel = defaultParallel
-					pending.opts.NumCtx = pending.origNumCTX * numParallel
+					pending.opts.NumCtx = pending.origNumCtx * numParallel
 				}

 				if loadedCount == 0 {
```
```diff
@@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
-		req.opts.NumCtx = req.origNumCTX * p
+		req.opts.NumCtx = req.origNumCtx * p
 		if !envconfig.SchedSpread {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
```
```diff
@@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	// Now try all the GPUs
 	for _, p := range numParallelToTry {
-		req.opts.NumCtx = req.origNumCTX * p
+		req.opts.NumCtx = req.origNumCtx * p
 		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 			slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 			*numParallel = p
```
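Both hunks follow the same shape: walk a list of candidate parallelism values in preference order, rescale the requested context for each, and take the first configuration that llm.PredictServerFit estimates will fit in VRAM. A compressed, self-contained sketch of that search under assumed candidate values (bestFit and fits are illustrative stand-ins, not the upstream functions):

```go
package main

import "fmt"

// bestFit compresses the search pattern shared by the two hunks
// above. fits stands in for llm.PredictServerFit, which in the real
// code also receives the GPU list, model metadata, and
// adapter/projector paths.
func bestFit(origNumCtx int, candidates []int, fits func(numCtx int) bool) (int, bool) {
	for _, p := range candidates {
		if fits(origNumCtx * p) { // scale the context for this parallelism
			return p, true
		}
	}
	return 0, false
}

func main() {
	// Pretend the VRAM budget accommodates an 8192-token KV cache.
	fits := func(numCtx int) bool { return numCtx <= 8192 }
	if p, ok := bestFit(2048, []int{8, 4, 1}, fits); ok {
		fmt.Println("parallel =", p) // parallel = 4
	}
}
```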