orangecat / ollama · commit 840424a2 (Unverified)

Merge pull request #4154 from dhiltgen/central_config

Centralize server config handling

Authored by Daniel Hiltgen on May 05, 2024; committed by GitHub on May 05, 2024.
Parents: 6707768e, f56aa200
Showing 12 changed files with 235 additions and 162 deletions (+235, -162).
app/lifecycle/logging.go           +3   -1
app/lifecycle/updater_windows.go   +1   -4
gpu/assets.go                      +5   -40
gpu/gpu.go                         +2   -1
llm/memory.go                      +3   -11
llm/server.go                      +5   -12
server/envconfig/config.go         +174 -0
server/envconfig/config_test.go    +20  -0
server/images.go                   +3   -2
server/routes.go                   +5   -21
server/sched.go                    +10  -43
server/sched_test.go               +4   -27
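The pattern is the same in every touched file: ad-hoc os.Getenv parsing at each call site is replaced by typed, pre-parsed variables in the new server/envconfig package, which reads the environment once at startup. A minimal before/after sketch of that call-site change (illustrative only, not code taken from this commit):

```go
package main

import (
	"fmt"
	"os"
)

// Before: each call site read and interpreted the raw environment string itself.
func debugEnabledOld() bool {
	return os.Getenv("OLLAMA_DEBUG") != ""
}

// After: call sites read a typed variable that the config package parsed once
// at startup. Debug stands in for envconfig.Debug so the sketch is self-contained.
var Debug bool

func debugEnabledNew() bool {
	return Debug
}

func main() {
	fmt.Println(debugEnabledOld(), debugEnabledNew())
}
```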
app/lifecycle/logging.go
@@ -5,12 +5,14 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 func InitLogging() {
 	level := slog.LevelInfo
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
app/lifecycle/updater_windows.go
@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS",               // Force close the tray app - might be needed
 	}
-	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
-	// TODO - temporarily disable since we're pinning in debug mode for the preview
-	// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
+	// make the upgrade as quiet as possible (no GUI, no prompts)
 	installArgs = append(installArgs,
 		"/SP",               // Skip the "This will install... Do you wish to continue" prompt
 		"/SUPPRESSMSGBOXES",
 		"/SILENT",
 		"/VERYSILENT",
 	)
-	// }
 
 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
gpu/assets.go
@@ -12,6 +12,8 @@ import (
 	"sync"
 	"syscall"
 	"time"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
+		runnersDir := envconfig.RunnersDir
-			var paths []string
-			for _, root := range []string{filepath.Dir(appExe), cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
 		if runnersDir != "" {
 			payloadsDir = runnersDir
 			return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {
 	// The remainder only applies on non-windows where we still carry payloads in the main executable
 	cleanupTmpDirs()
-	tmpDir := os.Getenv("OLLAMA_TMPDIR")
+	tmpDir := envconfig.TmpDir
 	if tmpDir == "" {
 		tmpDir, err = os.MkdirTemp("", "ollama")
 		if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
gpu/gpu.go
@@ -21,6 +21,7 @@ import (
 	"unsafe"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type handles struct {
@@ -268,7 +269,7 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 }
 
 func getVerboseState() C.uint16_t {
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
llm/memory.go
@@ -3,12 +3,11 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
 	}
 
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
llm/server.go
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--log-format", "json")
 	} else {
 		params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}
 
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}
@@ -194,15 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
-	}
+	numParallel := envconfig.NumParallel
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {
server/envconfig/config.go
new file mode 100644

package envconfig

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
)

var (
	// Set via OLLAMA_ORIGINS in the environment
	AllowOrigins []string
	// Set via OLLAMA_DEBUG in the environment
	Debug bool
	// Set via OLLAMA_LLM_LIBRARY in the environment
	LLMLibrary string
	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
	MaxRunners int
	// Set via OLLAMA_MAX_QUEUE in the environment
	MaxQueuedRequests int
	// Set via OLLAMA_MAX_VRAM in the environment
	MaxVRAM uint64
	// Set via OLLAMA_NOPRUNE in the environment
	NoPrune bool
	// Set via OLLAMA_NUM_PARALLEL in the environment
	NumParallel int
	// Set via OLLAMA_RUNNERS_DIR in the environment
	RunnersDir string
	// Set via OLLAMA_TMPDIR in the environment
	TmpDir string
)

func AsMap() map[string]string {
	return map[string]string{
		"OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
		"OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
		"OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
		"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
		"OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
		"OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
		"OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
		"OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
		"OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
		"OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
	}
}

var defaultAllowOrigins = []string{
	"localhost",
	"127.0.0.1",
	"0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
	// default values
	NumParallel = 1
	MaxRunners = 1
	MaxQueuedRequests = 512

	LoadConfig()
}

func LoadConfig() {
	if debug := clean("OLLAMA_DEBUG"); debug != "" {
		d, err := strconv.ParseBool(debug)
		if err == nil {
			Debug = d
		} else {
			Debug = true
		}
	}

	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
	if runtime.GOOS == "windows" && RunnersDir == "" {
		// On Windows we do not carry the payloads inside the main executable
		appExe, err := os.Executable()
		if err != nil {
			slog.Error("failed to lookup executable path", "error", err)
		}

		cwd, err := os.Getwd()
		if err != nil {
			slog.Error("failed to lookup working directory", "error", err)
		}

		var paths []string
		for _, root := range []string{filepath.Dir(appExe), cwd} {
			paths = append(paths,
				filepath.Join(root),
				filepath.Join(root, "windows-"+runtime.GOARCH),
				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
			)
		}

		// Try a few variations to improve developer experience when building from source in the local tree
		for _, p := range paths {
			candidate := filepath.Join(p, "ollama_runners")
			_, err := os.Stat(candidate)
			if err == nil {
				RunnersDir = candidate
				break
			}
		}
		if RunnersDir == "" {
			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
		}
	}

	TmpDir = clean("OLLAMA_TMPDIR")

	userLimit := clean("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseUint(userLimit, 10, 64)
		if err != nil {
			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
		} else {
			MaxVRAM = avail
		}
	}

	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
		val, err := strconv.Atoi(onp)
		if err != nil || val <= 0 {
			slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
		} else {
			NumParallel = val
		}
	}

	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
		NoPrune = true
	}

	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
		AllowOrigins = strings.Split(origins, ",")
	}
	for _, allowOrigin := range defaultAllowOrigins {
		AllowOrigins = append(AllowOrigins,
			fmt.Sprintf("http://%s", allowOrigin),
			fmt.Sprintf("https://%s", allowOrigin),
			fmt.Sprintf("http://%s:*", allowOrigin),
			fmt.Sprintf("https://%s:*", allowOrigin),
		)
	}

	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
	if maxRunners != "" {
		m, err := strconv.Atoi(maxRunners)
		if err != nil {
			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
		} else {
			MaxRunners = m
		}
	}

	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
		p, err := strconv.Atoi(onp)
		if err != nil || p <= 0 {
			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
		} else {
			MaxQueuedRequests = p
		}
	}
}
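For callers, the package exposes plain package-level variables plus AsMap for logging, and LoadConfig can be invoked again (as the test below does) to re-read the environment. A small usage sketch against the package as reconstructed above:

```go
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// init() in envconfig already ran LoadConfig once with the defaults
	// (NumParallel=1, MaxRunners=1, MaxQueuedRequests=512).
	fmt.Println("defaults:", envconfig.AsMap())

	// Change the environment and re-read it, the same way the tests do.
	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "2")
	os.Setenv("OLLAMA_NUM_PARALLEL", "4")
	envconfig.LoadConfig()

	fmt.Println("max loaded models:", envconfig.MaxRunners)  // 2
	fmt.Println("parallel requests:", envconfig.NumParallel) // 4
}
```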
server/envconfig/config_test.go
new file mode 100644

package envconfig

import (
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestConfig(t *testing.T) {
	os.Setenv("OLLAMA_DEBUG", "")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "false")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "1")
	LoadConfig()
	require.True(t, Debug)
}
server/images.go
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -695,7 +696,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
 			return err
 		}
@@ -1026,7 +1027,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
server/routes.go
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -859,12 +860,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
 	c.Status(http.StatusCreated)
 }
 
-var defaultAllowOrigins = []string{
-	"localhost",
-	"127.0.0.1",
-	"0.0.0.0",
-}
-
 func isLocalIP(ip netip.Addr) bool {
 	if interfaces, err := net.Interfaces(); err == nil {
 		for _, iface := range interfaces {
@@ -948,19 +943,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
-
-	if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
-		config.AllowOrigins = strings.Split(allowedOrigins, ",")
-	}
-
-	for _, allowOrigin := range defaultAllowOrigins {
-		config.AllowOrigins = append(config.AllowOrigins,
-			fmt.Sprintf("http://%s", allowOrigin),
-			fmt.Sprintf("https://%s", allowOrigin),
-			fmt.Sprintf("http://%s:*", allowOrigin),
-			fmt.Sprintf("https://%s:*", allowOrigin),
-		)
-	}
+	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()
 	r.Use(
@@ -999,10 +982,11 @@ func (s *Server) GenerateRoutes() http.Handler {
 func Serve(ln net.Listener) error {
 	level := slog.LevelInfo
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
+	slog.Info("server config", "env", envconfig.AsMap())
 	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level:     level,
 		AddSource: true,
@@ -1026,7 +1010,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
server/sched.go
@@ -5,10 +5,8 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -17,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"golang.org/x/exp/slices"
 )
@@ -43,46 +42,14 @@ type Scheduler struct {
 	getGpuFn func() gpu.GpuInfoList
 }
 
-var (
-	// TODO set this to zero after a release or two, to enable multiple models by default
-	loadedMax         = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
-	maxQueuedRequests = 512
-	numParallel       = 1
-	ErrMaxQueue       = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
-)
+var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
-	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
-	if maxRunners != "" {
-		m, err := strconv.Atoi(maxRunners)
-		if err != nil {
-			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
-		} else {
-			loadedMax = m
-		}
-	}
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
-		} else {
-			numParallel = p
-		}
-	}
-	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
-		} else {
-			maxQueuedRequests = p
-		}
-	}
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
-		finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
-		expiredCh:     make(chan *runnerRef, maxQueuedRequests),
-		unloadedCh:    make(chan interface{}, maxQueuedRequests),
+		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
+		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
@@ -94,7 +61,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 	// allocate a large enough kv cache for all parallel requests
-	opts.NumCtx = opts.NumCtx * numParallel
+	opts.NumCtx = opts.NumCtx * envconfig.NumParallel
 
 	req := &LlmRequest{
 		ctx: c,
@@ -147,11 +114,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				pending.useLoadedRunner(runner, s.finishedReqCh)
 				break
 			}
-		} else if loadedMax > 0 && loadedCount >= loadedMax {
+		} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 			slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 			runnerToExpire = s.findRunnerToUnload(pending)
 		} else {
-			// Either no models are loaded or below loadedMax
+			// Either no models are loaded or below envconfig.MaxRunners
 			// Get a refreshed GPU list
 			gpus := s.getGpuFn()
@@ -162,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// If we're CPU only mode, just limit by loadedMax above
+			// If we're CPU only mode, just limit by envconfig.MaxRunners above
 			// TODO handle system memory exhaustion
 			if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
 				slog.Debug("cpu mode with existing models, loading")
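With the scheduler's channels now sized from envconfig.MaxQueuedRequests, a full pending channel is what surfaces ErrMaxQueue to clients. A standalone sketch of that bounded-queue idea (not the scheduler code itself; the select/default rejection is an assumption made for illustration):

```go
package main

import (
	"errors"
	"fmt"
)

var errMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")

type request struct{ model string }

// submit does a non-blocking send: if the buffered channel (sized from the
// configured maximum queue depth) is already full, the request is rejected
// instead of blocking the caller.
func submit(pending chan *request, req *request) error {
	select {
	case pending <- req:
		return nil
	default:
		return errMaxQueue
	}
}

func main() {
	maxQueued := 2 // stands in for envconfig.MaxQueuedRequests
	pending := make(chan *request, maxQueued)

	for i := 0; i < 3; i++ {
		if err := submit(pending, &request{model: "m"}); err != nil {
			fmt.Println("request", i, "rejected:", err)
			continue
		}
		fmt.Println("request", i, "queued")
	}
}
```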
server/sched_test.go
@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -27,34 +28,10 @@ func init() {
 func TestInitScheduler(t *testing.T) {
 	ctx, done := context.WithCancel(context.Background())
 	defer done()
-	initialMax := loadedMax
-	initialParallel := numParallel
 	s := InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
-	s = InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
-	s = InitScheduler(ctx)
-	require.Equal(t, 0, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
-	_ = InitScheduler(ctx)
-	require.Equal(t, initialParallel, numParallel)
-	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
-	_ = InitScheduler(ctx)
-	require.Equal(t, 10, numParallel)
 }
 
 func TestLoad(t *testing.T) {
@@ -249,7 +226,7 @@ func TestRequests(t *testing.T) {
 		t.Errorf("timeout")
 	}
 
-	loadedMax = 1
+	envconfig.MaxRunners = 1
 	s.newServerFn = scenario3a.newServer
 	slog.Info("scenario3a")
 	s.pendingReqCh <- scenario3a.req
@@ -268,7 +245,7 @@ func TestRequests(t *testing.T) {
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 
-	loadedMax = 0
+	envconfig.MaxRunners = 0
 	s.newServerFn = scenario3b.newServer
 	slog.Info("scenario3b")
 	s.pendingReqCh <- scenario3b.req
@@ -339,7 +316,7 @@ func TestGetRunner(t *testing.T) {
 	scenario1b.req.sessionDuration = 0
 	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
 	scenario1c.req.sessionDuration = 0
-	maxQueuedRequests = 1
+	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
 		g := gpu.GpuInfo{Library: "metal"}