OpenDAS / ollama

Commit bee2f4a3, authored May 04, 2024 by Daniel Hiltgen
Parent: 88cf1544

Record GPU usage information

This records more GPU usage information for eventual UX inclusion.
Showing 3 changed files with 40 additions and 20 deletions:

    format/bytes.go   +2   -0
    llm/memory.go    +12  -12
    llm/server.go    +26   -8
format/bytes.go

```diff
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
```
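The new branch extends HumanBytes2's largest-unit-first switch so GiB-scale VRAM figures no longer render as thousands of MiB. A self-contained sketch of the post-commit behavior (unit constants assumed to be the usual binary multiples matching their names in format/bytes.go; the KiB and byte branches are inferred from the function's pattern, not shown in the hunk):

```go
package main

import "fmt"

const (
	KibiByte = 1024
	MebiByte = 1024 * KibiByte
	GibiByte = 1024 * MebiByte
)

// humanBytes2 mirrors format.HumanBytes2 after this commit: values of a
// gibibyte or more now render as GiB instead of falling through to MiB.
func humanBytes2(b uint64) string {
	switch {
	case b >= GibiByte:
		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	case b >= KibiByte:
		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(humanBytes2(6 * GibiByte)) // "6.0 GiB" rather than "6144.0 MiB"
}
```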
llm/memory.go

```diff
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
```
```diff
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
```
```diff
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
```
```diff
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
```
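Taken together, these hunks make EstimateGPULayers always compute the model's total size and return it as a third value, moving the CPU and insufficient-VRAM early exits to the end of the function so that even a failed offload still reports memoryRequiredTotal. A minimal toy sketch of that control-flow pattern (stand-in names and arithmetic, not ollama's actual estimator):

```go
package main

import "fmt"

// estimateLayers is a toy stand-in for EstimateGPULayers; the real function
// derives these quantities from GGML metadata and gpu.GpuInfo. The point is
// the shape: compute the full cost first so every return path can report it.
func estimateLayers(freeVRAM, perLayer, minimum, total uint64, isCPU bool) (int, uint64, uint64) {
	partial := minimum + perLayer // cheapest possible partial offload
	if isCPU {
		return 0, 0, total
	}
	if partial > freeVRAM {
		return 0, 0, total // nothing fits, but the total size is still known
	}
	layers := int((freeVRAM - minimum) / perLayer)
	return layers, minimum + uint64(layers)*perLayer, total
}

func main() {
	layers, vram, total := estimateLayers(8<<30, 200<<20, 512<<20, 13<<30, false)
	fmt.Printf("layers=%d vram=%d total=%d\n", layers, vram, total)
}
```

The server changes below thread the new value through NewLlamaServer into the llmServer struct.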
llm/server.go

```diff
@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
 
 	sem *semaphore.Weighted
 }
```
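The four fields carry everything needed for the "eventual UX inclusion" the commit message mentions. A hypothetical rendering sketch (the summary format is my own invention; only the field names come from the diff):

```go
package main

import "fmt"

// offloadSummary is a hypothetical status line built from the new llmServer
// fields (totalLayers, estimatedVRAM, estimatedTotal, gpuCount); the actual
// UX surface this commit anticipates is not part of the diff.
func offloadSummary(offloadedLayers, totalLayers, vram, total uint64, gpuCount int) string {
	return fmt.Sprintf("%d/%d layers on %d GPU(s), %.1f/%.1f GiB",
		offloadedLayers, totalLayers, gpuCount,
		float64(vram)/(1<<30), float64(total)/(1<<30))
}

func main() {
	fmt.Println(offloadSummary(28, 33, 7<<30, 13<<30, 1))
}
```

The remaining hunks populate these fields inside NewLlamaServer.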
```diff
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
```
```diff
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}
 
 	var layers int
-	layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+	layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 	if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 		// disable partial offloading when model is greater than total system memory as this
```
```diff
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 		servers = []string{demandLib}
+		if strings.HasPrefix(demandLib, "cpu") {
+			// Omit the GPU flag to silence the warning
+			opts.NumGPU = -1
+		}
 	}
 }
```
```diff
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an available port, retry on each iteration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
```
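The context lines around the new gpuCount reset show the server's free-port trick: resolve localhost:0 and let the OS assign a port. A standalone sketch of that idiom (the ListenTCP step is assumed from the usual form of the pattern, since the hunk is truncated; error handling simplified):

```go
package main

import (
	"fmt"
	"net"
)

// freePort asks the OS for an unused TCP port by listening on port 0,
// as the diff's context lines do before launching the runner subprocess.
func freePort() (int, error) {
	a, err := net.ResolveTCPAddr("tcp", "localhost:0")
	if err != nil {
		return 0, err
	}
	l, err := net.ListenTCP("tcp", a)
	if err != nil {
		return 0, err
	}
	defer l.Close()
	return l.Addr().(*net.TCPAddr).Port, nil
}

func main() {
	p, err := freePort()
	if err != nil {
		panic(err)
	}
	fmt.Println("picked port", p)
}
```

The final hunk below wires the new estimates into the llmServer literal.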
```diff
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	s := &llmServer{
-		port:          port,
-		cmd:           exec.Command(server, finalParams...),
-		status:        NewStatusWriter(os.Stderr),
-		options:       opts,
-		estimatedVRAM: estimatedVRAM,
-		sem:           semaphore.NewWeighted(int64(numParallel)),
+		port:           port,
+		cmd:            exec.Command(server, finalParams...),
+		status:         NewStatusWriter(os.Stderr),
+		options:        opts,
+		estimatedVRAM:  estimatedVRAM,
+		estimatedTotal: estimatedTotal,
+		sem:            semaphore.NewWeighted(int64(numParallel)),
+		totalLayers:    ggml.KV().BlockCount() + 1,
+		gpuCount:       gpuCount,
 	}
 	s.cmd.Env = os.Environ()
```