OpenDAS / ollama / Commits / ee49844d
"vscode:/vscode.git/clone" did not exist on "fb223d47353695bc1f09fc431ccc8a5cea56bbb4"
Unverified commit ee49844d, authored May 08, 2024 by Daniel Hiltgen; committed by GitHub, May 08, 2024.
Merge pull request #4153 from dhiltgen/gpu_verbose_response
Add GPU usage
Parents: 8a516ac8, bee2f4a3
Showing 3 changed files with 40 additions and 20 deletions (+40, -20):
  format/bytes.go  +2  -0
  llm/memory.go    +12 -12
  llm/server.go    +26 -8
format/bytes.go
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 func HumanBytes2(b uint64) string {
     switch {
+    case b >= GibiByte:
+        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
     case b >= MebiByte:
         return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
     case b >= KibiByte:
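
Why the new GiB branch matters here: the VRAM figures this PR starts reporting are typically gigabytes, and without a GiB case HumanBytes2 would render them in megabytes, e.g. "5734.4 MiB". Below is a minimal, self-contained sketch of the function's behavior after this change; the unit constants and the KiB/byte fall-through branches are reconstructed from context and may differ in detail from the repo's format package.

    package main

    import "fmt"

    // Binary size units, mirroring what format/bytes.go is assumed to define.
    const (
        KibiByte = 1024
        MebiByte = 1024 * KibiByte
        GibiByte = 1024 * MebiByte
    )

    func HumanBytes2(b uint64) string {
        switch {
        case b >= GibiByte: // the case added in this commit
            return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
        case b >= MebiByte:
            return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
        case b >= KibiByte:
            return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
        default:
            return fmt.Sprintf("%d B", b)
        }
    }

    func main() {
        fmt.Println(HumanBytes2(6012954214)) // "5.6 GiB" rather than "5734.4 MiB"
    }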
llm/memory.go
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     // Split up the GPUs by type and try them
     for _, gpus := range allGpus.ByLibrary() {
         var layerCount int
-        layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+        layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
         if opts.NumGPU < 0 {
             if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                 return true, estimatedVRAM
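
PredictServerFit only needs the layer count and the VRAM estimate, so it discards the new third return value with the blank identifier. The full-fit test it applies is "every transformer block plus one output layer", hence BlockCount()+1. A small helper expressing that predicate; this is hypothetical, the repo inlines the check as shown above:

    // fitsEntirely reports whether an offload estimate covers the whole model:
    // all transformer blocks plus the final output layer (blockCount + 1).
    func fitsEntirely(layerCount int, blockCount uint64) bool {
        return layerCount > 0 && layerCount >= int(blockCount+1)
    }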
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-    if gpus[0].Library == "cpu" {
-        return 0, 0
-    }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
     var memoryAvailable uint64
     for _, info := range gpus {
         memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
     memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-    if memoryRequiredPartial > memoryAvailable {
-        slog.Debug("insufficient VRAM to load any model layers")
-        return 0, 0
-    }
-
     var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
             ),
         ),
     )
-    return layerCount, uint64(memoryRequiredPartial)
+
+    if gpus[0].Library == "cpu" {
+        return 0, 0, memoryRequiredTotal
+    }
+    if memoryRequiredPartial > memoryAvailable {
+        slog.Debug("insufficient VRAM to load any model layers")
+        return 0, 0, memoryRequiredTotal
+    }
+    return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
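
Taken together, the memory.go changes move both early-out paths (CPU-only library, and not enough VRAM for even one layer) below the point where the total model size has been computed, so EstimateGPULayers can now report memoryRequiredTotal on every path. A hypothetical caller, written as if inside the llm package so the types above are in scope; the log line is illustrative and not part of this commit:

    // reportEstimate is a sketch of consuming all three return values.
    func reportEstimate(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) {
        layers, vram, total := EstimateGPULayers(gpus, ggml, projectors, opts)
        slog.Info("offload estimate",
            "layers", layers,
            "vram.required", format.HumanBytes2(vram),
            "model.total", format.HumanBytes2(total))
    }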
llm/server.go
@@ -49,7 +49,10 @@ type llmServer struct {
     options api.Options
 
     // TODO - this should be broken down by GPU
-    estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+    estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+    estimatedTotal uint64 // Total size of model
+    totalLayers    uint64
+    gpuCount       int
 
     sem *semaphore.Weighted
 }
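
The new struct fields give a loaded server enough state to describe its own GPU footprint. An illustrative accessor, not part of this diff, showing how they could compose; it assumes fmt and the format package (whose HumanBytes2 was extended above) are imported:

    // offloadSummary is a hypothetical sketch only; this commit merely
    // stores the fields on llmServer, it does not add such a method.
    func (s *llmServer) offloadSummary(offloadedLayers uint64) string {
        return fmt.Sprintf("%d/%d layers, %s of %s, %d GPU(s)",
            offloadedLayers, s.totalLayers,
            format.HumanBytes2(s.estimatedVRAM),
            format.HumanBytes2(s.estimatedTotal),
            s.gpuCount)
    }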
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     cpuRunner := ""
     var estimatedVRAM uint64
+    var estimatedTotal uint64
     var systemMemory uint64
+    gpuCount := len(gpus)
     if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
         // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
         cpuRunner = serverForCpu()
+        gpuCount = 0
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
     }
 
     var layers int
-    layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+    layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
     if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
         // disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     } else {
         slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
         servers = []string{demandLib}
+        if strings.HasPrefix(demandLib, "cpu") {
+            // Omit the GPU flag to silence the warning
+            opts.NumGPU = -1
+        }
     }
 }
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         continue
     }
 
+    if strings.HasPrefix(servers[i], "cpu") {
+        // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+        gpuCount = 0
+    }
+
     // Find an availableServers port, retry on each iterration in case the failure was a port conflict race
     port := 0
     if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }
 
     s := &llmServer{
-        port:          port,
-        cmd:           exec.Command(server, finalParams...),
-        status:        NewStatusWriter(os.Stderr),
-        options:       opts,
-        estimatedVRAM: estimatedVRAM,
-        sem:           semaphore.NewWeighted(int64(numParallel)),
+        port:           port,
+        cmd:            exec.Command(server, finalParams...),
+        status:         NewStatusWriter(os.Stderr),
+        options:        opts,
+        estimatedVRAM:  estimatedVRAM,
+        estimatedTotal: estimatedTotal,
+        sem:            semaphore.NewWeighted(int64(numParallel)),
+        totalLayers:    ggml.KV().BlockCount() + 1,
+        gpuCount:       gpuCount,
     }
 
     s.cmd.Env = os.Environ()
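
Note the struct-level TODO above: estimatedVRAM is still an aggregate across devices. With gpuCount now recorded on the server, a naive per-device figure can at least be approximated. A hypothetical sketch, not part of this commit; real layer splits are rarely uniform, which is exactly what the TODO is about:

    // perGPUEstimate naively divides the aggregate VRAM estimate evenly
    // across the GPUs used for the load. Illustrative only.
    func perGPUEstimate(estimatedVRAM uint64, gpuCount int) uint64 {
        if gpuCount <= 0 {
            return 0
        }
        return estimatedVRAM / uint64(gpuCount)
    }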