OpenDAS / ollama · Commits · 05cd82ef

Unverified commit 05cd82ef, authored Oct 16, 2024 by Daniel Hiltgen and committed by GitHub on Oct 16, 2024. Parent: 7d6eb0d4.

Rename gpu package discover (#7143)

Cleaning up go package naming
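The rename is mechanical: every file that previously declared package gpu or imported github.com/ollama/ollama/gpu now uses discover, and call sites change their qualifier to match. A minimal sketch of what a caller looks like after this commit (illustrative only, not a file from the diff; GetGPUInfo and the GpuInfo fields used here are taken from the changes below):

// example.go - illustrative sketch, not part of this commit
package main

import (
    "fmt"

    // Before this commit the import path was "github.com/ollama/ollama/gpu"
    // and the calls below were written gpu.GetGPUInfo(), gpu.GpuInfoList, etc.
    "github.com/ollama/ollama/discover"
)

func main() {
    // GPU discovery now lives in the discover package.
    gpus := discover.GetGPUInfo()
    for _, g := range gpus {
        fmt.Println(g.Library, g.ID, g.FreeMemory, g.TotalMemory)
    }
}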
Changes: 33 files in the commit. This page shows 13 changed files with 85 additions and 85 deletions (+85 / -85); the remaining files are on the second page of the diff.
Files changed on this page:

discover/gpu_oneapi.go           +1  -1
discover/gpu_test.go             +1  -1
discover/gpu_windows.go          +1  -1
discover/gpu_windows_test.go     +1  -1
discover/types.go                +1  -1
llm/memory.go                    +4  -4
llm/memory_test.go               +3  -3
llm/server.go                    +8  -8
runners/common.go                +7  -7
server/routes.go                 +2  -2
server/routes_generate_test.go   +9  -9
server/sched.go                  +23 -23
server/sched_test.go             +24 -24
gpu/gpu_oneapi.go → discover/gpu_oneapi.go

 //go:build linux || windows

-package gpu
+package discover

 import (
     "log/slog"
 ...
gpu/gpu_test.go → discover/gpu_test.go

-package gpu
+package discover

 import (
     "runtime"
 ...
gpu/gpu_windows.go → discover/gpu_windows.go

-package gpu
+package discover

 import (
     "fmt"
 ...
gpu/gpu_windows_test.go → discover/gpu_windows_test.go

-package gpu
+package discover

 import "testing"
 ...
gpu/types.go → discover/types.go

-package gpu
+package discover

 import (
     "fmt"
 ...
llm/memory.go

@@ -7,13 +7,13 @@ import (
     "strings"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
     // Split up the GPUs by type and try them
     var estimatedVRAM uint64
     for _, gpus := range allGpus.ByLibrary() {

@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
     // Graph size for a partial offload, applies to all GPUs
     var graphPartialOffload uint64

@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     gpuAllocations := make([]uint64, len(gpus))
     type gs struct {
         i int
-        g *gpu.GpuInfo
+        g *discover.GpuInfo
     }
     gpusWithSpace := []gs{}
     for i := range gpus {
llm/memory_test.go

@@ -10,7 +10,7 @@ import (
     "github.com/stretchr/testify/require"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/discover"
 )

 func TestEstimateGPULayers(t *testing.T) {

@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
     }

     // Simple CPU scenario
-    gpus := []gpu.GpuInfo{
+    gpus := []discover.GpuInfo{
         {
             Library: "cpu",
         },

@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
     // Dual CUDA scenario with assymetry
     gpuMinimumMemory := uint64(2048)
-    gpus = []gpu.GpuInfo{
+    gpus = []discover.GpuInfo{
         {
             Library:       "cuda",
             MinimumMemory: gpuMinimumMemory,
llm/server.go

@@ -26,9 +26,9 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/build"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llama"
     "github.com/ollama/ollama/runners"
 )

@@ -61,8 +61,8 @@ type llmServer struct {
     estimate    MemoryEstimate
     totalLayers uint64
     // gpuCount     int
-    gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-    loadDuration time.Duration   // Record how long it took the model to load
+    gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+    loadDuration time.Duration        // Record how long it took the model to load
     loadProgress float32

     sem *semaphore.Weighted

@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
     var err error
     var cpuRunner string
     var estimate MemoryEstimate

@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     var systemFreeMemory uint64
     var systemSwapFreeMemory uint64

-    systemInfo := gpu.GetSystemInfo()
+    systemInfo := discover.GetSystemInfo()
     systemTotalMemory = systemInfo.System.TotalMemory
     systemFreeMemory = systemInfo.System.FreeMemory
     systemSwapFreeMemory = systemInfo.System.FreeSwap

@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
     if opts.NumGPU == 0 {
-        gpus = gpu.GetCPUInfo()
+        gpus = discover.GetCPUInfo()
     }
     if len(gpus) == 1 && gpus[0].Library == "cpu" {
         cpuRunner = runners.ServerForCpu()

@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     case gpus[0].Library != "metal" && estimate.Layers == 0:
         // Don't bother loading into the GPU if no layers can fit
         cpuRunner = runners.ServerForCpu()
-        gpus = gpu.GetCPUInfo()
+        gpus = discover.GetCPUInfo()
     case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
         opts.NumGPU = estimate.Layers
     }

@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }

     if strings.HasPrefix(servers[i], "cpu") {
-        gpus = gpu.GetCPUInfo()
+        gpus = discover.GetCPUInfo()
     }

     // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
runners/common.go

@@ -18,8 +18,8 @@ import (
     "golang.org/x/sync/errgroup"

+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/gpu"
 )

 const (

@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
     // glob workDir for files that start with ollama_
     availableServers := GetAvailableServers(runnersDir)
     requested := info.Library
-    if info.Variant != gpu.CPUCapabilityNone.String() {
+    if info.Variant != discover.CPUCapabilityNone.String() {
         requested += "_" + info.Variant
     }

@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
     if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
         // Load up the best CPU variant if not primary requested
         if info.Library != "cpu" {
-            variant := gpu.GetCPUCapability()
+            variant := discover.GetCPUCapability()
             // If no variant, then we fall back to default
             // If we have a variant, try that if we find an exact match
             // Attempting to run the wrong CPU instructions will panic the
             // process
-            if variant != gpu.CPUCapabilityNone {
+            if variant != discover.CPUCapabilityNone {
                 for cmp := range availableServers {
                     if cmp == "cpu_"+variant.String() {
                         servers = append(servers, cmp)

@@ -371,9 +371,9 @@ func ServerForCpu() string {
     if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
         return "metal"
     }
-    variant := gpu.GetCPUCapability()
+    variant := discover.GetCPUCapability()
     availableServers := GetAvailableServers(runnersDir)
-    if variant != gpu.CPUCapabilityNone {
+    if variant != discover.CPUCapabilityNone {
         for cmp := range availableServers {
             if cmp == "cpu_"+variant.String() {
                 return cmp
server/routes.go

@@ -27,8 +27,8 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/build"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"

@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {
     // At startup we retrieve GPU information so we can get log messages before loading a model
     // This will log warnings to the log in case we have problems with detected GPUs
-    gpus := gpu.GetGPUInfo()
+    gpus := discover.GetGPUInfo()
     gpus.LogDetails()

     err = srvr.Serve(ln)
server/routes_generate_test.go

@@ -15,7 +15,7 @@ import (
     "github.com/google/go-cmp/cmp"

     "github.com/ollama/ollama/api"
-    "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/llm"
 )

@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
     return
 }

-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-    return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+    return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return mock, nil
     }
 }

@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
         unloadedCh:  make(chan any, 1),
         loaded:      make(map[string]*runnerRef),
         newServerFn: newMockServer(&mock),
-        getGpuFn:    gpu.GetGPUInfo,
-        getCpuFn:    gpu.GetCPUInfo,
+        getGpuFn:    discover.GetGPUInfo,
+        getCpuFn:    discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{

@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
         unloadedCh:  make(chan any, 1),
         loaded:      make(map[string]*runnerRef),
         newServerFn: newMockServer(&mock),
-        getGpuFn:    gpu.GetGPUInfo,
-        getCpuFn:    gpu.GetCPUInfo,
+        getGpuFn:    discover.GetGPUInfo,
+        getCpuFn:    discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
-        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+        loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
             // add small delay to simulate loading
             time.Sleep(time.Millisecond)
             req.successCh <- &runnerRef{
server/sched.go

@@ -15,9 +15,9 @@ import (
     "time"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
 )

@@ -41,10 +41,10 @@ type Scheduler struct {
     loaded   map[string]*runnerRef
     loadedMu sync.Mutex

-    loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-    newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-    getGpuFn     func() gpu.GpuInfoList
-    getCpuFn     func() gpu.GpuInfoList
+    loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+    newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+    getGpuFn     func() discover.GpuInfoList
+    getCpuFn     func() discover.GpuInfoList
     reschedDelay time.Duration
 }

@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
         unloadedCh:  make(chan interface{}, maxQueue),
         loaded:      make(map[string]*runnerRef),
         newServerFn: llm.NewLlamaServer,
-        getGpuFn:    gpu.GetGPUInfo,
-        getCpuFn:    gpu.GetCPUInfo,
+        getGpuFn:    discover.GetGPUInfo,
+        getCpuFn:    discover.GetCPUInfo,
         reschedDelay: 250 * time.Millisecond,
     }
     sched.loadFn = sched.load

@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
     } else {
         // Either no models are loaded or below envconfig.MaxRunners
         // Get a refreshed GPU list
-        var gpus gpu.GpuInfoList
+        var gpus discover.GpuInfoList
         if pending.opts.NumGPU == 0 {
             gpus = s.getCpuFn()
         } else {

@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
     }()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
     if numParallel < 1 {
         numParallel = 1
     }

@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
     }()
 }

-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
     type predKey struct {
         Library string
         ID      string

@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-    ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+    ret := append(discover.GpuInfoList{}, allGpus...)
     s.loadedMu.Lock()
     defer s.loadedMu.Unlock()
     for _, runner := range s.loaded {

@@ -541,8 +541,8 @@ type runnerRef struct {
     // unloading bool      // set to true when we are trying to unload the runner

     llama          llm.LlamaServer
-    loading        bool            // True only during initial load, then false forever
-    gpus           gpu.GpuInfoList // Recorded at time of provisioning
+    loading        bool                 // True only during initial load, then false forever
+    gpus           discover.GpuInfoList // Recorded at time of provisioning
     estimatedVRAM  uint64
     estimatedTotal uint64

@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
     start := time.Now()

     // Establish a baseline before we unload
-    gpusBefore := gpu.GetGPUInfo()
+    gpusBefore := discover.GetGPUInfo()
     var totalMemoryBefore, freeMemoryBefore uint64
     for _, gpu := range gpusBefore {
         totalMemoryBefore += gpu.TotalMemory

@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
     }

     // Query GPUs, look for free to go back up
-    gpusNow := gpu.GetGPUInfo()
+    gpusNow := discover.GetGPUInfo()
     var totalMemoryNow, freeMemoryNow uint64
     for _, gpu := range gpusNow {
         totalMemoryNow += gpu.TotalMemory

@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
     var estimatedVRAM uint64

     var numParallelToTry []int

@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
     for _, gl := range gpus.ByLibrary() {
         var ok bool
-        sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+        sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
         // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-        sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+        sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

         // First attempt to fit the model into a single GPU
         for _, p := range numParallelToTry {
             req.opts.NumCtx = req.origNumCtx * p
             if !envconfig.SchedSpread() {
                 for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
                         slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
                         *numParallel = p
-                        return []gpu.GpuInfo{g}
+                        return []discover.GpuInfo{g}
                     }
                 }
             }

@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
     if *numParallel <= 0 {
         *numParallel = 1
         req.opts.NumCtx = req.origNumCtx

@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
     slog.Debug("evaluating if CPU model load will fit in available system memory")
     estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
     if estimate.TotalSize <= gpus[0].FreeMemory {
server/sched_test.go

@@ -13,8 +13,8 @@ import (
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/app/lifecycle"
+    "github.com/ollama/ollama/discover"
     "github.com/ollama/ollama/format"
-    "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
 )

@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
         sessionDuration: &api.Duration{Duration: 2 * time.Second},
     }
     // Fail to load model first
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return nil, errors.New("something failed to load model blah")
     }
-    gpus := gpu.GpuInfoList{}
+    gpus := discover.GpuInfoList{}
     s.load(req, ggml, gpus, 0)
     require.Empty(t, req.successCh)
     require.Len(t, req.errCh, 1)

@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
     require.Contains(t, err.Error(), "this model may be incompatible")

     server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return server, nil
     }
     s.load(req, ggml, gpus, 0)

@@ -102,7 +102,7 @@ type reqBundle struct {
     ggml *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
     return scenario.srv, nil
 }

@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
     return b
 }

-func getGpuFn() gpu.GpuInfoList {
-    g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+    g := discover.GpuInfo{Library: "metal"}
     g.TotalMemory = 24 * format.GigaByte
     g.FreeMemory = 12 * format.GigaByte
-    return []gpu.GpuInfo{g}
+    return []discover.GpuInfo{g}
 }

-func getCpuFn() gpu.GpuInfoList {
-    g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+    g := discover.GpuInfo{Library: "cpu"}
     g.TotalMemory = 32 * format.GigaByte
     g.FreeMemory = 26 * format.GigaByte
-    return []gpu.GpuInfo{g}
+    return []discover.GpuInfo{g}
 }

 func TestRequestsSameModelSameRequest(t *testing.T) {

@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
     }
     var ggml *llm.GGML
-    gpus := gpu.GpuInfoList{}
+    gpus := discover.GpuInfoList{}
     server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         return server, nil
     }
     s.load(req, ggml, gpus, 0)

@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
     // Same model, same request
     scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
     s := InitScheduler(ctx)
-    s.getGpuFn = func() gpu.GpuInfoList {
-        g := gpu.GpuInfo{Library: "metal"}
+    s.getGpuFn = func() discover.GpuInfoList {
+        g := discover.GpuInfo{Library: "metal"}
         g.TotalMemory = 24 * format.GigaByte
         g.FreeMemory = 12 * format.GigaByte
-        return []gpu.GpuInfo{g}
+        return []discover.GpuInfo{g}
     }
     s.newServerFn = scenario1a.newServer
     successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)

@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
     defer done()
-    gpus := gpu.GpuInfoList{
+    gpus := discover.GpuInfoList{
         {
             Library: "a",
             ID:      "1",

@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
     defer done()
-    gpus := gpu.GpuInfoList{
+    gpus := discover.GpuInfoList{
         {
             Library: "cuda",
             ID:      "0",

@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
             ID:      "1",
         },
     }
-    r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+    r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
     s := InitScheduler(ctx)
     s.loadedMu.Lock()

@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
     require.Len(t, tmp, 1)
     require.Equal(t, "1", tmp[0].ID)

-    r1.gpus = gpu.GpuInfoList{gpus[1]}
+    r1.gpus = discover.GpuInfoList{gpus[1]}
     tmp = s.filterGPUsWithoutLoadingModels(gpus)
     require.Len(t, tmp, 1)
     require.Equal(t, "0", tmp[0].ID)

-    r1.gpus = gpu.GpuInfoList{}
+    r1.gpus = discover.GpuInfoList{}
     tmp = s.filterGPUsWithoutLoadingModels(gpus)
     require.Len(t, tmp, 2)
 }

@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
     defer done()
     s := InitScheduler(ctx)

-    s.getGpuFn = func() gpu.GpuInfoList {
+    s.getGpuFn = func() discover.GpuInfoList {
         // Set memory values to require the model to be spread
-        gpus := []gpu.GpuInfo{
+        gpus := []discover.GpuInfo{
             {Library: "cuda"},
             {Library: "rocm"},
         }

@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
     }
     s.getCpuFn = getCpuFn
     a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+    s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
         require.Len(t, gpus, 1)
         return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
     }
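The test changes above also show the injection pattern the scheduler relies on: getGpuFn and getCpuFn are plain function fields, so tests can substitute fabricated hardware built from discover types instead of probing real GPUs. A sketch of such a stub, mirroring the getGpuFn helper in sched_test.go after the rename (illustrative only; the helper name is hypothetical and not part of the commit):

package server

import (
    "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/format"
)

// fakeSingleGPU mimics the stubbed getGpuFn in sched_test.go: it fabricates a
// single GPU with 24 GiB total and 12 GiB free, using discover types after the
// rename. Illustrative sketch only, not part of this commit.
func fakeSingleGPU() discover.GpuInfoList {
    g := discover.GpuInfo{Library: "metal"}
    g.TotalMemory = 24 * format.GigaByte
    g.FreeMemory = 12 * format.GigaByte
    return []discover.GpuInfo{g}
}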