Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
06e5d74e
Unverified
Commit
06e5d74e
authored
Jul 20, 2024
by
Daniel Hiltgen
Committed by
GitHub
Jul 20, 2024
Browse files
Merge pull request #5506 from dhiltgen/sched_tests
Refine scheduler unit tests for reliability
parents
5d707e6f
f4408219
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
195 additions
and
130 deletions
+195
-130
server/sched_test.go
server/sched_test.go
+195
-130
No files found.
server/sched_test.go
View file @
06e5d74e
...
@@ -7,6 +7,7 @@ import (
...
@@ -7,6 +7,7 @@ import (
"fmt"
"fmt"
"log/slog"
"log/slog"
"os"
"os"
"runtime"
"testing"
"testing"
"time"
"time"
...
@@ -94,7 +95,7 @@ func TestLoad(t *testing.T) {
...
@@ -94,7 +95,7 @@ func TestLoad(t *testing.T) {
require
.
Len
(
t
,
s
.
expiredCh
,
1
)
require
.
Len
(
t
,
s
.
expiredCh
,
1
)
}
}
type
b
undle
struct
{
type
reqB
undle
struct
{
ctx
context
.
Context
//nolint:containedctx
ctx
context
.
Context
//nolint:containedctx
ctxDone
func
()
ctxDone
func
()
srv
*
mockLlm
srv
*
mockLlm
...
@@ -102,13 +103,13 @@ type bundle struct {
...
@@ -102,13 +103,13 @@ type bundle struct {
ggml
*
llm
.
GGML
ggml
*
llm
.
GGML
}
}
func
(
scenario
*
b
undle
)
newServer
(
gpus
gpu
.
GpuInfoList
,
model
string
,
ggml
*
llm
.
GGML
,
adapters
[]
string
,
projectors
[]
string
,
opts
api
.
Options
,
numParallel
int
)
(
llm
.
LlamaServer
,
error
)
{
func
(
scenario
*
reqB
undle
)
newServer
(
gpus
gpu
.
GpuInfoList
,
model
string
,
ggml
*
llm
.
GGML
,
adapters
[]
string
,
projectors
[]
string
,
opts
api
.
Options
,
numParallel
int
)
(
llm
.
LlamaServer
,
error
)
{
return
scenario
.
srv
,
nil
return
scenario
.
srv
,
nil
}
}
func
newScenario
(
t
*
testing
.
T
,
ctx
context
.
Context
,
modelName
string
,
estimatedVRAM
uint64
)
*
b
undle
{
func
newScenario
Request
(
t
*
testing
.
T
,
ctx
context
.
Context
,
modelName
string
,
estimatedVRAM
uint64
,
duration
*
api
.
Duration
)
*
reqB
undle
{
scenario
:=
&
b
undle
{}
b
:=
&
reqB
undle
{}
scenario
.
ctx
,
scenario
.
ctxDone
=
context
.
WithCancel
(
ctx
)
b
.
ctx
,
b
.
ctxDone
=
context
.
WithCancel
(
ctx
)
t
.
Helper
()
t
.
Helper
()
f
,
err
:=
os
.
CreateTemp
(
t
.
TempDir
(),
modelName
)
f
,
err
:=
os
.
CreateTemp
(
t
.
TempDir
(),
modelName
)
...
@@ -135,124 +136,154 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
...
@@ -135,124 +136,154 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
fname
:=
f
.
Name
()
fname
:=
f
.
Name
()
model
:=
&
Model
{
Name
:
modelName
,
ModelPath
:
fname
}
model
:=
&
Model
{
Name
:
modelName
,
ModelPath
:
fname
}
scenario
.
ggml
,
err
=
llm
.
LoadModel
(
model
.
ModelPath
,
0
)
b
.
ggml
,
err
=
llm
.
LoadModel
(
model
.
ModelPath
,
0
)
require
.
NoError
(
t
,
err
)
require
.
NoError
(
t
,
err
)
scenario
.
req
=
&
LlmRequest
{
if
duration
==
nil
{
ctx
:
scenario
.
ctx
,
duration
=
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
}
}
b
.
req
=
&
LlmRequest
{
ctx
:
b
.
ctx
,
model
:
model
,
model
:
model
,
opts
:
api
.
DefaultOptions
(),
opts
:
api
.
DefaultOptions
(),
sessionDuration
:
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
}
,
sessionDuration
:
duration
,
successCh
:
make
(
chan
*
runnerRef
,
1
),
successCh
:
make
(
chan
*
runnerRef
,
1
),
errCh
:
make
(
chan
error
,
1
),
errCh
:
make
(
chan
error
,
1
),
}
}
scenario
.
srv
=
&
mockLlm
{
estimatedVRAM
:
estimatedVRAM
,
estimatedVRAMByGPU
:
map
[
string
]
uint64
{
""
:
estimatedVRAM
}}
b
.
srv
=
&
mockLlm
{
estimatedVRAM
:
estimatedVRAM
,
estimatedVRAMByGPU
:
map
[
string
]
uint64
{
""
:
estimatedVRAM
}}
return
scenario
return
b
}
}
func
TestRequests
(
t
*
testing
.
T
)
{
func
getGpuFn
()
gpu
.
GpuInfoList
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
10
*
time
.
Second
)
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
defer
done
()
g
.
TotalMemory
=
24
*
format
.
GigaByte
g
.
FreeMemory
=
12
*
format
.
GigaByte
// Same model, same request
return
[]
gpu
.
GpuInfo
{
g
}
scenario1a
:=
newScenario
(
t
,
ctx
,
"ollama-model-1"
,
10
)
}
scenario1a
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
}
scenario1b
:=
newScenario
(
t
,
ctx
,
"ollama-model-1"
,
11
)
scenario1b
.
req
.
model
=
scenario1a
.
req
.
model
scenario1b
.
ggml
=
scenario1a
.
ggml
scenario1b
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
0
}
// simple reload of same model
scenario2a
:=
newScenario
(
t
,
ctx
,
"ollama-model-1"
,
20
)
tmpModel
:=
*
scenario1a
.
req
.
model
scenario2a
.
req
.
model
=
&
tmpModel
scenario2a
.
ggml
=
scenario1a
.
ggml
scenario2a
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
}
// Multiple loaded models
func
getCpuFn
()
gpu
.
GpuInfoList
{
scenario3a
:=
newScenario
(
t
,
ctx
,
"ollama-model-3a"
,
1
*
format
.
GigaByte
)
g
:=
gpu
.
GpuInfo
{
Library
:
"cpu"
}
scenario3b
:=
newScenario
(
t
,
ctx
,
"ollama-model-3b"
,
24
*
format
.
GigaByte
)
g
.
TotalMemory
=
32
*
format
.
GigaByte
scenario3c
:=
newScenario
(
t
,
ctx
,
"ollama-model-4a"
,
30
)
g
.
FreeMemory
=
26
*
format
.
GigaByte
scenario3c
.
req
.
opts
.
NumGPU
=
0
// CPU load, will be allowed
return
[]
gpu
.
GpuInfo
{
g
}
scenario3d
:=
newScenario
(
t
,
ctx
,
"ollama-model-3c"
,
30
)
// Needs prior unloaded
}
func
TestRequestsSameModelSameRequest
(
t
*
testing
.
T
)
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
500
*
time
.
Millisecond
)
defer
done
()
s
:=
InitScheduler
(
ctx
)
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
func
()
gpu
.
GpuInfoList
{
s
.
getGpuFn
=
getGpuFn
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
s
.
getCpuFn
=
getCpuFn
g
.
TotalMemory
=
24
*
format
.
GigaByte
a
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1"
,
10
,
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
})
g
.
FreeMemory
=
12
*
format
.
GigaByte
b
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1"
,
11
,
&
api
.
Duration
{
Duration
:
0
})
return
[]
gpu
.
GpuInfo
{
g
}
b
.
req
.
model
=
a
.
req
.
model
}
b
.
ggml
=
a
.
ggml
s
.
getCpuFn
=
func
()
gpu
.
GpuInfoList
{
g
:=
gpu
.
GpuInfo
{
Library
:
"cpu"
}
s
.
newServerFn
=
a
.
newServer
g
.
TotalMemory
=
32
*
format
.
GigaByte
slog
.
Info
(
"a"
)
g
.
FreeMemory
=
26
*
format
.
GigaByte
s
.
pendingReqCh
<-
a
.
req
return
[]
gpu
.
GpuInfo
{
g
}
}
s
.
newServerFn
=
scenario1a
.
newServer
slog
.
Info
(
"scenario1a"
)
s
.
pendingReqCh
<-
scenario1a
.
req
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
s
.
Run
(
ctx
)
s
.
Run
(
ctx
)
select
{
select
{
case
resp
:=
<-
scenario1
a
.
req
.
successCh
:
case
resp
:=
<-
a
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario1
a
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
a
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario1
a
.
req
.
errCh
)
require
.
Empty
(
t
,
a
.
req
.
errCh
)
case
err
:=
<-
scenario1
a
.
req
.
errCh
:
case
err
:=
<-
a
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
// Same runner as first request due to not needing a reload
// Same runner as first request due to not needing a reload
s
.
newServerFn
=
scenario1
b
.
newServer
s
.
newServerFn
=
b
.
newServer
slog
.
Info
(
"
scenario1
b"
)
slog
.
Info
(
"b"
)
s
.
pendingReqCh
<-
scenario1
b
.
req
s
.
pendingReqCh
<-
b
.
req
select
{
select
{
case
resp
:=
<-
scenario1
b
.
req
.
successCh
:
case
resp
:=
<-
b
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario1
a
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
a
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario1b
.
req
.
errCh
)
require
.
Empty
(
t
,
b
.
req
.
errCh
)
case
err
:=
<-
scenario1b
.
req
.
errCh
:
case
err
:=
<-
b
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
}
}
func
TestRequestsSimpleReloadSameModel
(
t
*
testing
.
T
)
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
500
*
time
.
Millisecond
)
defer
done
()
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
getGpuFn
s
.
getCpuFn
=
getCpuFn
a
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1"
,
10
,
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
})
b
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1"
,
20
,
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
})
tmpModel
:=
*
a
.
req
.
model
b
.
req
.
model
=
&
tmpModel
b
.
ggml
=
a
.
ggml
s
.
newServerFn
=
a
.
newServer
slog
.
Info
(
"a"
)
s
.
pendingReqCh
<-
a
.
req
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
s
.
Run
(
ctx
)
select
{
case
resp
:=
<-
a
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
a
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
a
.
req
.
errCh
)
case
err
:=
<-
a
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
// Trigger a reload
// Trigger a reload
s
.
newServerFn
=
scenario2a
.
newServer
s
.
newServerFn
=
b
.
newServer
scenario2a
.
req
.
model
.
AdapterPaths
=
[]
string
{
"new"
}
b
.
req
.
model
.
AdapterPaths
=
[]
string
{
"new"
}
slog
.
Info
(
"
scenario2a
"
)
slog
.
Info
(
"
b
"
)
s
.
pendingReqCh
<-
scenario2a
.
req
s
.
pendingReqCh
<-
b
.
req
// finish first two requests, so model can reload
// finish first two requests, so model can reload
time
.
Sleep
(
1
*
time
.
Millisecond
)
time
.
Sleep
(
1
*
time
.
Millisecond
)
scenario1a
.
ctxDone
()
a
.
ctxDone
()
scenario1b
.
ctxDone
()
select
{
select
{
case
resp
:=
<-
scenario2a
.
req
.
successCh
:
case
resp
:=
<-
b
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario2a
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
b
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario2a
.
req
.
errCh
)
require
.
Empty
(
t
,
b
.
req
.
errCh
)
case
err
:=
<-
scenario2a
.
req
.
errCh
:
case
err
:=
<-
b
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
}
func
TestRequestsMultipleLoadedModels
(
t
*
testing
.
T
)
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
500
*
time
.
Millisecond
)
defer
done
()
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
getGpuFn
s
.
getCpuFn
=
getCpuFn
// Multiple loaded models
a
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-3a"
,
1
*
format
.
GigaByte
,
nil
)
b
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-3b"
,
24
*
format
.
GigaByte
,
nil
)
c
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-4a"
,
30
,
nil
)
c
.
req
.
opts
.
NumGPU
=
0
// CPU load, will be allowed
d
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-3c"
,
30
,
nil
)
// Needs prior unloaded
envconfig
.
MaxRunners
=
1
envconfig
.
MaxRunners
=
1
s
.
newServerFn
=
scenario3a
.
newServer
s
.
newServerFn
=
a
.
newServer
slog
.
Info
(
"scenario3a"
)
slog
.
Info
(
"a"
)
s
.
pendingReqCh
<-
scenario3a
.
req
s
.
pendingReqCh
<-
a
.
req
// finish prior request, so new model can load
s
.
Run
(
ctx
)
time
.
Sleep
(
1
*
time
.
Millisecond
)
scenario2a
.
ctxDone
()
select
{
select
{
case
resp
:=
<-
scenario3
a
.
req
.
successCh
:
case
resp
:=
<-
a
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario3
a
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
a
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario3
a
.
req
.
errCh
)
require
.
Empty
(
t
,
a
.
req
.
errCh
)
case
err
:=
<-
scenario3
a
.
req
.
errCh
:
case
err
:=
<-
a
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
...
@@ -262,15 +293,15 @@ func TestRequests(t *testing.T) {
...
@@ -262,15 +293,15 @@ func TestRequests(t *testing.T) {
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
envconfig
.
MaxRunners
=
0
envconfig
.
MaxRunners
=
0
s
.
newServerFn
=
scenario3
b
.
newServer
s
.
newServerFn
=
b
.
newServer
slog
.
Info
(
"
scenario3
b"
)
slog
.
Info
(
"b"
)
s
.
pendingReqCh
<-
scenario3
b
.
req
s
.
pendingReqCh
<-
b
.
req
select
{
select
{
case
resp
:=
<-
scenario3
b
.
req
.
successCh
:
case
resp
:=
<-
b
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario3
b
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
b
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario3
b
.
req
.
errCh
)
require
.
Empty
(
t
,
b
.
req
.
errCh
)
case
err
:=
<-
scenario3
b
.
req
.
errCh
:
case
err
:=
<-
b
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
...
@@ -280,15 +311,15 @@ func TestRequests(t *testing.T) {
...
@@ -280,15 +311,15 @@ func TestRequests(t *testing.T) {
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
// This is a CPU load with NumGPU = 0 so it should load
// This is a CPU load with NumGPU = 0 so it should load
s
.
newServerFn
=
scenario3
c
.
newServer
s
.
newServerFn
=
c
.
newServer
slog
.
Info
(
"
scenario3
c"
)
slog
.
Info
(
"c"
)
s
.
pendingReqCh
<-
scenario3
c
.
req
s
.
pendingReqCh
<-
c
.
req
select
{
select
{
case
resp
:=
<-
scenario3
c
.
req
.
successCh
:
case
resp
:=
<-
c
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario3
c
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
c
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario3
c
.
req
.
errCh
)
require
.
Empty
(
t
,
c
.
req
.
errCh
)
case
err
:=
<-
scenario3
c
.
req
.
errCh
:
case
err
:=
<-
c
.
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
...
@@ -298,25 +329,25 @@ func TestRequests(t *testing.T) {
...
@@ -298,25 +329,25 @@ func TestRequests(t *testing.T) {
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
// Try to load a model that won't fit
// Try to load a model that won't fit
s
.
newServerFn
=
scenario3
d
.
newServer
s
.
newServerFn
=
d
.
newServer
slog
.
Info
(
"
scenario3
d"
)
slog
.
Info
(
"d"
)
s
.
loadedMu
.
Lock
()
s
.
loadedMu
.
Lock
()
require
.
Len
(
t
,
s
.
loaded
,
3
)
require
.
Len
(
t
,
s
.
loaded
,
3
)
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
scenario3
a
.
ctxDone
()
// Won't help since this one isn't big enough to make room
a
.
ctxDone
()
// Won't help since this one isn't big enough to make room
time
.
Sleep
(
2
*
time
.
Millisecond
)
time
.
Sleep
(
2
*
time
.
Millisecond
)
s
.
pendingReqCh
<-
scenario3
d
.
req
s
.
pendingReqCh
<-
d
.
req
// finish prior request, so new model can load
// finish prior request, so new model can load
time
.
Sleep
(
6
*
time
.
Millisecond
)
time
.
Sleep
(
6
*
time
.
Millisecond
)
s
.
loadedMu
.
Lock
()
s
.
loadedMu
.
Lock
()
require
.
Len
(
t
,
s
.
loaded
,
2
)
require
.
Len
(
t
,
s
.
loaded
,
2
)
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
scenario3
b
.
ctxDone
()
b
.
ctxDone
()
select
{
select
{
case
resp
:=
<-
scenario3
d
.
req
.
successCh
:
case
resp
:=
<-
d
.
req
.
successCh
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario3
d
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
d
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
scenario3
d
.
req
.
errCh
)
require
.
Empty
(
t
,
d
.
req
.
errCh
)
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
...
@@ -325,30 +356,59 @@ func TestRequests(t *testing.T) {
...
@@ -325,30 +356,59 @@ func TestRequests(t *testing.T) {
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
}
}
func
Test
GetRunner
(
t
*
testing
.
T
)
{
func
Test
RequestsModelTooBigForSystem
(
t
*
testing
.
T
)
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
1
00
*
time
.
Millisecond
)
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
5
00
*
time
.
Millisecond
)
defer
done
()
defer
done
()
scenario1a
:=
newScenario
(
t
,
ctx
,
"ollama-model-1a"
,
10
)
scenario1a
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
0
}
scenario1b
:=
newScenario
(
t
,
ctx
,
"ollama-model-1b"
,
10
)
scenario1b
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
0
}
scenario1c
:=
newScenario
(
t
,
ctx
,
"ollama-model-1c"
,
10
)
scenario1c
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
0
}
envconfig
.
MaxQueuedRequests
=
1
s
:=
InitScheduler
(
ctx
)
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
func
()
gpu
.
GpuInfoList
{
s
.
getGpuFn
=
func
()
gpu
.
GpuInfoList
{
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
g
.
TotalMemory
=
2
4
*
format
.
Giga
Byte
g
.
TotalMemory
=
4
*
format
.
Mebi
Byte
g
.
FreeMemory
=
12
*
format
.
Giga
Byte
g
.
FreeMemory
=
3
*
format
.
Mebi
Byte
return
[]
gpu
.
GpuInfo
{
g
}
return
[]
gpu
.
GpuInfo
{
g
}
}
}
s
.
newServerFn
=
scenario1a
.
newServer
slog
.
Info
(
"scenario1a"
)
s
.
getCpuFn
=
func
()
gpu
.
GpuInfoList
{
successCh1a
,
errCh1a
:=
s
.
GetRunner
(
scenario1a
.
ctx
,
scenario1a
.
req
.
model
,
scenario1a
.
req
.
opts
,
scenario1a
.
req
.
sessionDuration
)
g
:=
gpu
.
GpuInfo
{
Library
:
"cpu"
}
g
.
TotalMemory
=
4
*
format
.
MebiByte
g
.
FreeMemory
=
2
*
format
.
MebiByte
return
[]
gpu
.
GpuInfo
{
g
}
}
a
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1"
,
10
,
&
api
.
Duration
{
Duration
:
5
*
time
.
Millisecond
})
s
.
newServerFn
=
a
.
newServer
slog
.
Info
(
"a"
)
s
.
pendingReqCh
<-
a
.
req
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
slog
.
Info
(
"scenario1b"
)
s
.
Run
(
ctx
)
successCh1b
,
errCh1b
:=
s
.
GetRunner
(
scenario1b
.
ctx
,
scenario1b
.
req
.
model
,
scenario1b
.
req
.
opts
,
scenario1b
.
req
.
sessionDuration
)
select
{
case
<-
a
.
req
.
successCh
:
if
runtime
.
GOOS
==
"linux"
{
t
.
Fatal
(
"request should have been rejected with out of space"
)
}
// else - Darwin and Windows don't reject right now
case
err
:=
<-
a
.
req
.
errCh
:
require
.
Contains
(
t
,
err
.
Error
(),
"too large"
)
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
}
}
func
TestGetRunner
(
t
*
testing
.
T
)
{
ctx
,
done
:=
context
.
WithTimeout
(
context
.
Background
(),
100
*
time
.
Millisecond
)
defer
done
()
a
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1a"
,
10
,
&
api
.
Duration
{
Duration
:
2
*
time
.
Millisecond
})
b
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1b"
,
10
,
&
api
.
Duration
{
Duration
:
2
*
time
.
Millisecond
})
c
:=
newScenarioRequest
(
t
,
ctx
,
"ollama-model-1c"
,
10
,
&
api
.
Duration
{
Duration
:
2
*
time
.
Millisecond
})
envconfig
.
MaxQueuedRequests
=
1
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
getGpuFn
s
.
getCpuFn
=
getCpuFn
s
.
newServerFn
=
a
.
newServer
slog
.
Info
(
"a"
)
successCh1a
,
errCh1a
:=
s
.
GetRunner
(
a
.
ctx
,
a
.
req
.
model
,
a
.
req
.
opts
,
a
.
req
.
sessionDuration
)
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
slog
.
Info
(
"b"
)
successCh1b
,
errCh1b
:=
s
.
GetRunner
(
b
.
ctx
,
b
.
req
.
model
,
b
.
req
.
opts
,
b
.
req
.
sessionDuration
)
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
require
.
Len
(
t
,
s
.
pendingReqCh
,
1
)
require
.
Empty
(
t
,
successCh1b
)
require
.
Empty
(
t
,
successCh1b
)
require
.
Len
(
t
,
errCh1b
,
1
)
require
.
Len
(
t
,
errCh1b
,
1
)
...
@@ -357,22 +417,24 @@ func TestGetRunner(t *testing.T) {
...
@@ -357,22 +417,24 @@ func TestGetRunner(t *testing.T) {
s
.
Run
(
ctx
)
s
.
Run
(
ctx
)
select
{
select
{
case
resp
:=
<-
successCh1a
:
case
resp
:=
<-
successCh1a
:
require
.
Equal
(
t
,
resp
.
llama
,
scenario1
a
.
srv
)
require
.
Equal
(
t
,
resp
.
llama
,
a
.
srv
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
s
.
pendingReqCh
)
require
.
Empty
(
t
,
errCh1a
)
require
.
Empty
(
t
,
errCh1a
)
case
err
:=
<-
errCh1a
:
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
scenario1
a
.
ctxDone
()
a
.
ctxDone
()
// Set "a" model to idle so it can unload
s
.
loadedMu
.
Lock
()
s
.
loadedMu
.
Lock
()
require
.
Len
(
t
,
s
.
loaded
,
1
)
require
.
Len
(
t
,
s
.
loaded
,
1
)
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
scenario1
c
.
req
.
model
.
ModelPath
=
"bad path"
c
.
req
.
model
.
ModelPath
=
"bad path"
slog
.
Info
(
"
scenario1
c"
)
slog
.
Info
(
"c"
)
successCh1c
,
errCh1c
:=
s
.
GetRunner
(
scenario1c
.
ctx
,
scenario1
c
.
req
.
model
,
scenario1
c
.
req
.
opts
,
scenario1
c
.
req
.
sessionDuration
)
successCh1c
,
errCh1c
:=
s
.
GetRunner
(
c
.
ctx
,
c
.
req
.
model
,
c
.
req
.
opts
,
c
.
req
.
sessionDuration
)
// Starts in pending channel, then should be quickly processed to return an error
// Starts in pending channel, then should be quickly processed to return an error
time
.
Sleep
(
5
*
time
.
Millisecond
)
time
.
Sleep
(
20
*
time
.
Millisecond
)
// Long enough for the "a" model to expire and unload
require
.
Empty
(
t
,
successCh1c
)
require
.
Empty
(
t
,
successCh1c
)
s
.
loadedMu
.
Lock
()
s
.
loadedMu
.
Lock
()
require
.
Empty
(
t
,
s
.
loaded
)
require
.
Empty
(
t
,
s
.
loaded
)
...
@@ -380,7 +442,7 @@ func TestGetRunner(t *testing.T) {
...
@@ -380,7 +442,7 @@ func TestGetRunner(t *testing.T) {
require
.
Len
(
t
,
errCh1c
,
1
)
require
.
Len
(
t
,
errCh1c
,
1
)
err
=
<-
errCh1c
err
=
<-
errCh1c
require
.
Contains
(
t
,
err
.
Error
(),
"bad path"
)
require
.
Contains
(
t
,
err
.
Error
(),
"bad path"
)
scenario1
b
.
ctxDone
()
b
.
ctxDone
()
}
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count
// TODO - add one scenario that triggers the bogus finished event with positive ref count
...
@@ -389,7 +451,7 @@ func TestPrematureExpired(t *testing.T) {
...
@@ -389,7 +451,7 @@ func TestPrematureExpired(t *testing.T) {
defer
done
()
defer
done
()
// Same model, same request
// Same model, same request
scenario1a
:=
newScenario
(
t
,
ctx
,
"ollama-model-1a"
,
10
)
scenario1a
:=
newScenario
Request
(
t
,
ctx
,
"ollama-model-1a"
,
10
,
nil
)
s
:=
InitScheduler
(
ctx
)
s
:=
InitScheduler
(
ctx
)
s
.
getGpuFn
=
func
()
gpu
.
GpuInfoList
{
s
.
getGpuFn
=
func
()
gpu
.
GpuInfoList
{
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
g
:=
gpu
.
GpuInfo
{
Library
:
"metal"
}
...
@@ -411,6 +473,8 @@ func TestPrematureExpired(t *testing.T) {
...
@@ -411,6 +473,8 @@ func TestPrematureExpired(t *testing.T) {
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
slog
.
Info
(
"sending premature expired event now"
)
slog
.
Info
(
"sending premature expired event now"
)
s
.
expiredCh
<-
resp
// Shouldn't happen in real life, but make sure it's safe
s
.
expiredCh
<-
resp
// Shouldn't happen in real life, but make sure it's safe
case
err
:=
<-
errCh1a
:
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
...
@@ -446,6 +510,8 @@ func TestUseLoadedRunner(t *testing.T) {
...
@@ -446,6 +510,8 @@ func TestUseLoadedRunner(t *testing.T) {
select
{
select
{
case
success
:=
<-
req
.
successCh
:
case
success
:=
<-
req
.
successCh
:
require
.
Equal
(
t
,
r1
,
success
)
require
.
Equal
(
t
,
r1
,
success
)
case
err
:=
<-
req
.
errCh
:
t
.
Fatal
(
err
.
Error
())
case
<-
ctx
.
Done
()
:
case
<-
ctx
.
Done
()
:
t
.
Fatal
(
"timeout"
)
t
.
Fatal
(
"timeout"
)
}
}
...
@@ -625,8 +691,7 @@ func TestAlreadyCanceled(t *testing.T) {
...
@@ -625,8 +691,7 @@ func TestAlreadyCanceled(t *testing.T) {
defer
done
()
defer
done
()
dctx
,
done2
:=
context
.
WithCancel
(
ctx
)
dctx
,
done2
:=
context
.
WithCancel
(
ctx
)
done2
()
done2
()
scenario1a
:=
newScenario
(
t
,
dctx
,
"ollama-model-1"
,
10
)
scenario1a
:=
newScenarioRequest
(
t
,
dctx
,
"ollama-model-1"
,
10
,
&
api
.
Duration
{
Duration
:
0
})
scenario1a
.
req
.
sessionDuration
=
&
api
.
Duration
{
Duration
:
0
}
s
:=
InitScheduler
(
ctx
)
s
:=
InitScheduler
(
ctx
)
slog
.
Info
(
"scenario1a"
)
slog
.
Info
(
"scenario1a"
)
s
.
pendingReqCh
<-
scenario1a
.
req
s
.
pendingReqCh
<-
scenario1a
.
req
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment