OpenDAS / ollama · commit 5c191276 (unverified)

Authored by Michael Yang on Jul 31, 2024; committed by GitHub on Jul 31, 2024.

Merge pull request #5473 from ollama/mxyng/environ

fix: environ lookup

Parents: 71399aa6, 85d9d73a
Showing 7 changed files with 22 additions and 36 deletions.
server/routes.go              +3  -3
server/routes_create_test.go  +0  -10
server/routes_delete_test.go  +0  -2
server/routes_list_test.go    +0  -2
server/routes_test.go         +0  -4
server/sched.go               +16 -11
server/sched_test.go          +3  -4
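Taken together, the changes switch envconfig from package-level variables that had to be populated with envconfig.LoadConfig() to accessor functions (envconfig.Debug(), envconfig.Origins(), envconfig.MaxRunners(), envconfig.KeepAlive(), and so on) that look the value up when called. The sketch below shows the general shape of such an accessor; the Bool helper and its wiring are illustrative assumptions, not the actual envconfig implementation.

```go
package envconfig

import (
	"os"
	"strconv"
)

// Bool returns an accessor that re-reads the named environment variable on
// every call. Because nothing is cached in package state, tests can change
// behavior with t.Setenv and no LoadConfig step is required.
// (Hypothetical helper for illustration only.)
func Bool(key string) func() bool {
	return func() bool {
		v, err := strconv.ParseBool(os.Getenv(key))
		if err != nil {
			return false
		}
		return v
	}
}

// Debug reports whether OLLAMA_DEBUG is set (illustrative wiring).
var Debug = Bool("OLLAMA_DEBUG")
```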
server/routes.go

```diff
@@ -1053,7 +1053,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	for _, prop := range openAIProperties {
 		config.AllowHeaders = append(config.AllowHeaders, "x-stainless-"+prop)
 	}
-	config.AllowOrigins = envconfig.AllowOrigins
+	config.AllowOrigins = envconfig.Origins()

 	r := gin.Default()
 	r.Use(
@@ -1098,7 +1098,7 @@ func (s *Server) GenerateRoutes() http.Handler {

 func Serve(ln net.Listener) error {
 	level := slog.LevelInfo
-	if envconfig.Debug {
+	if envconfig.Debug() {
 		level = slog.LevelDebug
 	}
@@ -1126,7 +1126,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}

-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
```
server/routes_create_test.go

```diff
@@ -15,7 +15,6 @@ import (
 	"github.com/gin-gonic/gin"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
 )
@@ -89,7 +88,6 @@ func TestCreateFromBin(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -117,7 +115,6 @@ func TestCreateFromModel(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -160,7 +157,6 @@ func TestCreateRemovesLayers(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -209,7 +205,6 @@ func TestCreateUnsetsSystem(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -267,7 +262,6 @@ func TestCreateMergeParameters(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -372,7 +366,6 @@ func TestCreateReplacesMessages(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -450,7 +443,6 @@ func TestCreateTemplateSystem(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -534,7 +526,6 @@ func TestCreateLicenses(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
 	w := createRequest(t, s.CreateModelHandler, api.CreateRequest{
@@ -582,7 +573,6 @@ func TestCreateDetectTemplate(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server

 	t.Run("matched", func(t *testing.T) {
```
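The same pattern repeats in the remaining test files below: the envconfig import and the envconfig.LoadConfig() call disappear, and configuration is supplied purely through t.Setenv. A minimal sketch of the resulting setup, using a hypothetical test name:

```go
package server

import "testing"

// TestEnvSetupSketch shows the post-change setup idiom: point OLLAMA_MODELS
// at a temp directory via t.Setenv and let envconfig read it lazily. t.Setenv
// also restores the previous value when the test finishes, so no explicit
// cleanup or reload step is needed. (Hypothetical test for illustration.)
func TestEnvSetupSketch(t *testing.T) {
	t.Setenv("OLLAMA_MODELS", t.TempDir())

	var s Server // handlers on s observe OLLAMA_MODELS when they run
	_ = s
}
```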
server/routes_delete_test.go

```diff
@@ -10,7 +10,6 @@ import (
 	"github.com/gin-gonic/gin"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/types/model"
 )
@@ -19,7 +18,6 @@ func TestDelete(t *testing.T) {
 	p := t.TempDir()
 	t.Setenv("OLLAMA_MODELS", p)
-	envconfig.LoadConfig()

 	var s Server
```
server/routes_list_test.go

```diff
@@ -9,14 +9,12 @@ import (
 	"github.com/gin-gonic/gin"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 )

 func TestList(t *testing.T) {
 	gin.SetMode(gin.TestMode)

 	t.Setenv("OLLAMA_MODELS", t.TempDir())
-	envconfig.LoadConfig()

 	expectNames := []string{
 		"mistral:7b-instruct-q4_0",
```
server/routes_test.go

```diff
@@ -19,7 +19,6 @@ import (
 	"github.com/stretchr/testify/require"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@@ -347,7 +346,6 @@ func Test_Routes(t *testing.T) {
 	}

 	t.Setenv("OLLAMA_MODELS", t.TempDir())
-	envconfig.LoadConfig()

 	s := &Server{}
 	router := s.GenerateRoutes()
@@ -378,7 +376,6 @@ func Test_Routes(t *testing.T) {
 func TestCase(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
-	envconfig.LoadConfig()

 	cases := []string{
 		"mistral",
@@ -458,7 +455,6 @@ func TestCase(t *testing.T) {
 func TestShow(t *testing.T) {
 	t.Setenv("OLLAMA_MODELS", t.TempDir())
-	envconfig.LoadConfig()

 	var s Server
```
server/sched.go

```diff
@@ -5,9 +5,11 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"os"
 	"reflect"
 	"runtime"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -59,11 +61,12 @@ var defaultParallel = 4

 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
+	maxQueue := envconfig.MaxQueue()
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
-		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
+		pendingReqCh:  make(chan *LlmRequest, maxQueue),
+		finishedReqCh: make(chan *LlmRequest, maxQueue),
+		expiredCh:     make(chan *runnerRef, maxQueue),
+		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
@@ -126,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
-			numParallel := envconfig.NumParallel
+			numParallel := int(envconfig.NumParallel())
 			// TODO (jmorganca): multimodal models don't support parallel yet
 			// see https://github.com/ollama/ollama/issues/4165
 			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
@@ -148,7 +151,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					pending.useLoadedRunner(runner, s.finishedReqCh)
 					break
 				}
-			} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
+			} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) {
 				slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 				runnerToExpire = s.findRunnerToUnload()
 			} else {
@@ -161,7 +164,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					gpus = s.getGpuFn()
 				}

-				if envconfig.MaxRunners <= 0 {
+				if envconfig.MaxRunners() <= 0 {
 					// No user specified MaxRunners, so figure out what automatic setting to use
 					// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
 					// if any GPU has unreliable free memory reporting, 1x the number of GPUs
@@ -173,11 +176,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						}
 					}
 					if allReliable {
-						envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+						// HACK
+						os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
 						slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
 					} else {
+						// HACK
+						os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
 						slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
-						envconfig.MaxRunners = len(gpus)
 					}
 				}
@@ -404,7 +409,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	if numParallel < 1 {
 		numParallel = 1
 	}
-	sessionDuration := envconfig.KeepAlive
+	sessionDuration := envconfig.KeepAlive()
 	if req.sessionDuration != nil {
 		sessionDuration = req.sessionDuration.Duration
 	}
@@ -699,7 +704,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	// First attempt to fit the model into a single GPU
 	for _, p := range numParallelToTry {
 		req.opts.NumCtx = req.origNumCtx * p
-		if !envconfig.SchedSpread {
+		if !envconfig.SchedSpread() {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
```
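The two "// HACK" lines in processPending are the one place where the function-style lookup forces a workaround: because envconfig.MaxRunners() now reads the environment instead of a mutable package variable, the scheduler publishes its computed default by writing OLLAMA_MAX_LOADED_MODELS back with os.Setenv so that later lookups observe it. A standalone sketch of that write-back pattern, with assumed example values and a simplified stand-in for the accessor:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// maxLoadedModels mimics a function-style lookup that re-reads the
// environment on each call (simplified stand-in for envconfig.MaxRunners()).
func maxLoadedModels() int {
	n, err := strconv.Atoi(os.Getenv("OLLAMA_MAX_LOADED_MODELS"))
	if err != nil {
		return 0 // 0 means "pick a default automatically"
	}
	return n
}

func main() {
	gpuCount, defaultModelsPerGPU := 2, 3 // assumed example values

	if maxLoadedModels() <= 0 {
		// Write the computed default back into the environment so every
		// later lookup sees the same value (the "HACK" in sched.go).
		os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*gpuCount))
	}

	fmt.Println("effective max loaded models:", maxLoadedModels()) // prints 6
}
```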
server/sched_test.go

```diff
@@ -12,7 +12,6 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
@@ -272,7 +271,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	c.req.opts.NumGPU = 0 // CPU load, will be allowed
 	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

-	envconfig.MaxRunners = 1
+	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
 	slog.Info("a")
 	s.pendingReqCh <- a.req
@@ -291,7 +290,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()

-	envconfig.MaxRunners = 0
+	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 	s.newServerFn = b.newServer
 	slog.Info("b")
 	s.pendingReqCh <- b.req
@@ -362,7 +361,7 @@ func TestGetRunner(t *testing.T) {
 	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
 	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
 	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
-	envconfig.MaxQueuedRequests = 1
+	t.Setenv("OLLAMA_MAX_QUEUE", "1")
 	s := InitScheduler(ctx)
 	s.getGpuFn = getGpuFn
 	s.getCpuFn = getCpuFn
```
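In the scheduler tests, limits that used to be set by assigning to envconfig.MaxRunners or envconfig.MaxQueuedRequests are now supplied with t.Setenv before the code under test runs, which also scopes the override to the test. A small sketch of why the ordering matters, with a hypothetical test name (the queue size is read from OLLAMA_MAX_QUEUE at the moment InitScheduler creates its channels, so the env var must be set first):

```go
package server

import (
	"context"
	"testing"
)

// TestQueueLimitSketch is a hypothetical test: set OLLAMA_MAX_QUEUE before
// calling InitScheduler so the accessor sees the override when the request
// channels are sized.
func TestQueueLimitSketch(t *testing.T) {
	t.Setenv("OLLAMA_MAX_QUEUE", "1")

	s := InitScheduler(context.Background())
	if cap(s.pendingReqCh) != 1 {
		t.Fatalf("expected queue capacity 1, got %d", cap(s.pendingReqCh))
	}
}
```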