orangecat / ollama · Commits

Commit e70610ef
"tests/vscode:/vscode.git/clone" did not exist on "d8b6f5d09eb0cb9b7913c235a6fc69b698a5b1a3"
Unverified commit, authored Jul 01, 2024 by Daniel Hiltgen; committed by GitHub on Jul 01, 2024
Merge pull request #5410 from dhiltgen/ctx_cleanup
Fix case for NumCtx
Parents: dfded7e0, 173b5504
Showing 2 changed files with 9 additions and 9 deletions:

envconfig/config.go  +2 -2
server/sched.go      +7 -7
envconfig/config.go
@@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"},
+		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"},
+		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
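For orientation, AsMap() returns the full table of configuration knobs touched above, one EnvVar per environment variable. A minimal sketch of iterating it follows; it is not part of this commit, and the Value and Description field names are assumptions inferred from the three-part entries in the diff:

	package main

	import (
		"fmt"

		"github.com/ollama/ollama/envconfig"
	)

	func main() {
		// Dump every knob with its current value and help text.
		// e.Value and e.Description are assumed field names.
		for name, e := range envconfig.AsMap() {
			fmt.Printf("%s=%v\n    %s\n", name, e.Value, e.Description)
		}
	}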
server/sched.go
@@ -23,7 +23,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
-	origNumCTX      int // Track the initial ctx request
+	origNumCtx      int // Track the initial ctx request
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
@@ -118,8 +118,8 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
-			if pending.origNumCTX == 0 {
-				pending.origNumCTX = pending.opts.NumCtx
+			if pending.origNumCtx == 0 {
+				pending.origNumCtx = pending.opts.NumCtx
 			}
 
 			if pending.ctx.Err() != nil {
@@ -135,7 +135,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			}
 			// Keep NumCtx and numParallel in sync
 			if numParallel > 1 {
-				pending.opts.NumCtx = pending.origNumCTX * numParallel
+				pending.opts.NumCtx = pending.origNumCtx * numParallel
 			}
 
 			for {
@@ -197,7 +197,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					// simplifying assumption of defaultParallel when in CPU mode
 					if numParallel <= 0 {
 						numParallel = defaultParallel
-						pending.opts.NumCtx = pending.origNumCTX * numParallel
+						pending.opts.NumCtx = pending.origNumCtx * numParallel
 					}
 
 					if loadedCount == 0 {
@@ -691,7 +691,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCTX * p
+			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread {
 				for _, g := range sgl {
 					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
@@ -709,7 +709,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
-			req.opts.NumCtx = req.origNumCTX * p
+			req.opts.NumCtx = req.origNumCtx * p
 			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				numParallel = p
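Every call site of the renamed field applies the same sizing rule visible in the hunks above: the runner's context window is the client's originally requested context multiplied by the number of parallel slots, so each slot keeps a full context. A standalone sketch of that arithmetic, using illustrative values that are not taken from the commit:

	package main

	import "fmt"

	func main() {
		// Illustrative values only: a client asks for a 2048-token
		// context and the scheduler serves 4 requests in parallel.
		origNumCtx := 2048
		numParallel := 4

		// Same rule as pending.opts.NumCtx = pending.origNumCtx * numParallel:
		// the runner must hold one full context per parallel slot.
		effectiveNumCtx := origNumCtx * numParallel
		fmt.Println("effective NumCtx:", effectiveNumCtx) // 8192
	}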