OpenDAS / ollama — commit d6dd2ff8 (unverified)
Authored Apr 01, 2024 by Michael Yang; committed via GitHub on Apr 01, 2024
Parents: e57a6ba8, 91b3e4d2

Merge pull request #3241 from ollama/mxyng/mem

    update memory estimations for gpu offloading

Showing 7 changed files with 121 additions and 85 deletions (+121 -85):
  format/bytes.go        +16  -1
  gpu/gpu.go             +11  -14
  gpu/types.go            +3  -0
  llm/dyn_ext_server.go   +2  -2
  llm/ggml.go            +11  -0
  llm/llm.go             +73  -63
  server/routes.go        +5  -5
format/bytes.go

@@ -6,11 +6,15 @@ import (
 )
 
 const (
-	Byte = 1
+	Byte = 1
+
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
+
+	KibiByte = Byte * 1024
+	MebiByte = KibiByte * 1024
 )
 
 func HumanBytes(b int64) string {

@@ -45,3 +49,14 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
+
+func HumanBytes2(b int64) string {
+	switch {
+	case b >= MebiByte:
+		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
+	case b >= KibiByte:
+		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
+	default:
+		return fmt.Sprintf("%d B", b)
+	}
+}
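The new binary-unit constants and HumanBytes2 sit alongside the existing decimal HumanBytes helper. A minimal usage sketch, assuming the import path shown in the gpu/gpu.go hunk below and the pre-existing decimal behaviour of HumanBytes (output comments are approximate):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/format"
)

func main() {
	// 377 * MebiByte is the same quantity used for the minimum-memory
	// constants introduced in gpu/gpu.go.
	n := int64(377 * format.MebiByte)

	fmt.Println(format.HumanBytes(n))  // decimal units, roughly "395 MB"
	fmt.Println(format.HumanBytes2(n)) // binary units: "377.0 MiB"
}
```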
gpu/gpu.go

@@ -20,6 +20,8 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
+
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {

@@ -27,6 +29,11 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }
 
+const (
+	cudaMinimumMemory = 377 * format.MebiByte
+	rocmMinimumMemory = 377 * format.MebiByte
+)
+
 var gpuMutex sync.Mutex
 
 var gpuHandles *handles = nil

@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}

@@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}

@@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}

@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*1024*1024*1024 {
-			overhead = gpus * 1024 * 1024 * 1024
-		}
-		// Assigning full reported free memory for Tegras due to OS controlled caching.
-		if CudaTegra != "" {
-			// Setting overhead for non-Tegra devices
-			overhead = 0
-		}
-		avail := int64(gpuInfo.FreeMemory - overhead)
-		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
-		return avail, nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
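The net effect of this file's changes: CheckVRAM no longer subtracts a heuristic overhead (10% of free VRAM, floored at 1 GiB per device) and instead reports the full free memory, while a fixed 377 MiB floor per supported library is exposed through the new GpuInfo.MinimumMemory field and accounted for by the caller in llm/llm.go. A small sketch of the arithmetic, using a hypothetical single GPU reporting 8 GiB free (numbers are illustrative only):

```go
package main

import "fmt"

const gib = 1024 * 1024 * 1024

func main() {
	free := int64(8 * gib) // hypothetical reported free VRAM

	// Old heuristic removed in this commit: reserve 10% of free VRAM,
	// but at least 1 GiB per device.
	overhead := free / 10
	if overhead < 1*gib {
		overhead = 1 * gib
	}
	fmt.Println("old CheckVRAM result:", free-overhead) // 7 GiB

	// New behaviour: return the full reported free memory; the fixed
	// 377 MiB minimum is surfaced via GpuInfo.MinimumMemory instead.
	minimumMemory := int64(377 * 1024 * 1024)
	fmt.Println("new CheckVRAM result:", free)
	fmt.Println("memory considered used up front:", minimumMemory)
}
```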
gpu/types.go

@@ -14,6 +14,9 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 
+	// MinimumMemory represents the minimum memory required to use the GPU
+	MinimumMemory int64 `json:"-"`
+
 	// TODO add other useful attributes about the card here for discovery information
 }
llm/dyn_ext_server.go

@@ -39,7 +39,7 @@ import (
 
 type dynExtServer struct {
 	s       C.struct_dynamic_llama_server
-	options api.Options
+	options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations

@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if !mutex.TryLock() {
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
llm/ggml.go

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strings"
 )
 
 type GGML struct {

@@ -12,6 +13,16 @@ type GGML struct {
 	model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+	for _, t := range ggml.Tensors() {
+		if strings.HasPrefix(t.Name, prefix) {
+			n += int64(t.size())
+		}
+	}
+
+	return
+}
+
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
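LayerSize sums the sizes of all tensors whose names share a prefix, which is what llm/llm.go uses below to cost individual transformer blocks ("blk.N.") and the output layer ("output."). A standalone sketch of the same prefix-sum idea, using a hypothetical in-memory tensor list rather than a decoded GGUF file:

```go
package main

import (
	"fmt"
	"strings"
)

// tensor is a stand-in for the GGML tensor entries with a name and a byte size.
type tensor struct {
	name string
	size int64
}

// layerSize mirrors (*GGML).LayerSize: sum the bytes of tensors matching a prefix.
func layerSize(tensors []tensor, prefix string) (n int64) {
	for _, t := range tensors {
		if strings.HasPrefix(t.name, prefix) {
			n += t.size
		}
	}
	return
}

func main() {
	// Made-up sizes, for illustration only.
	tensors := []tensor{
		{"blk.0.attn_q.weight", 33554432},
		{"blk.0.ffn_up.weight", 90177536},
		{"blk.1.attn_q.weight", 33554432},
		{"output.weight", 262144000},
	}

	fmt.Println(layerSize(tensors, "blk.0."))  // bytes of the first repeating layer
	fmt.Println(layerSize(tensors, "output.")) // the output layer, costed separately in llm/llm.go
}
```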
llm/llm.go

@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"runtime"
 	"slices"
+	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
 

@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
 	"mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}

@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, size, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
@@ -49,84 +50,93 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
 
-	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.KV().GQA()) * kv / 6
 
-	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		opts.NumGPU = 0
+	usedMemory += graph
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
 	}
 
-	info := gpu.GetGPUInfo()
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
-
-		if size+kv+graph > vram {
-			slog.Info("not enough vram available, setting num_gpu=0")
-			opts.NumGPU = 0
-			break
-		}
-
-		// TODO: implement layer splitting on macOS
-		opts.NumGPU = 999
-	default:
-		if info.Library == "cpu" {
-			slog.Info("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
-		}
-
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			break
-		}
-
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
-
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.KV().BlockCount()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
-
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = int(layers)
-	}
-
-	opts.RopeFrequencyBase = 0.0
-	opts.RopeFrequencyScale = 0.0
+	requiredMemory := usedMemory
+
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
+
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
+		}
+	}
+
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
+
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
+
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
+
 	return newLlmServer(info, model, adapters, projectors, opts)
 }
 
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
+}
+
 // Give any native cgo implementations an opportunity to initialize
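To make the kv-cache and graph estimates above concrete, here is a worked example with hypothetical Llama-7B-style hyperparameters; in the real code these values come from ggml.KV(), the numbers below are illustrative only:

```go
package main

import "fmt"

func main() {
	var (
		numCtx          int64 = 2048 // opts.NumCtx
		blockCount      int64 = 32   // ggml.KV().BlockCount()
		embeddingLength int64 = 4096 // ggml.KV().EmbeddingLength()
		headCount       int64 = 32   // ggml.KV().HeadCount()
		headCountKV     int64 = 32   // ggml.KV().HeadCountKV()
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV

	// graph estimate: 1/6 * kv_cache_size * num_gqa, with GQA = n_head / n_head_kv
	gqa := headCount / headCountKV
	graph := gqa * kv / 6

	fmt.Printf("kv cache: %d bytes (~%.1f MiB)\n", kv, float64(kv)/(1024*1024))       // ~1024.0 MiB
	fmt.Printf("graph:    %d bytes (~%.1f MiB)\n", graph, float64(graph)/(1024*1024)) // ~170.7 MiB
}
```

For a grouped-query-attention model, HeadCountKV is smaller than HeadCount, so the kv term shrinks while the GQA multiplier in the graph estimate compensates.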
@@ -134,7 +144,7 @@ func Init() error {
 	return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
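The offload decision in the new New is a greedy per-layer loop: each repeating block is costed as its tensor bytes plus a proportional share of the kv cache, and layers are assigned to the GPU until the next layer would exceed available memory or a user-requested NumGPU count is reached. A toy simulation of that loop with made-up sizes, to show how the layers count falls out (all inputs are hypothetical):

```go
package main

import "fmt"

func main() {
	const mib = 1024 * 1024

	// Hypothetical inputs: 32 repeating blocks of ~170 MiB each plus a
	// per-layer share of a 1 GiB kv cache, and 6 GiB of available VRAM.
	blockCount := int64(32)
	layerBytes := int64(170 * mib)
	kv := int64(1024 * mib)
	availableMemory := int64(6 * 1024 * mib)
	usedMemory := int64(377 * mib) // stand-in for GpuInfo.MinimumMemory
	numGPU := int64(-1)            // -1 means "let the estimator decide"

	var layers int64
	for i := int64(0); i < blockCount; i++ {
		layerMemory := layerBytes + kv/blockCount
		if availableMemory > usedMemory+layerMemory && (numGPU < 0 || layers < numGPU) {
			usedMemory += layerMemory
			layers++
		}
	}

	fmt.Printf("offloaded %d/%d layers, used %.1f MiB of %.1f MiB\n",
		layers, blockCount, float64(usedMemory)/mib, float64(availableMemory)/mib)
}
```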
server/routes.go

@@ -68,7 +68,7 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?

@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 		loaded.Model = model
 		loaded.runner = llmRunner
 
-		loaded.Options = &opts
+		loaded.Options = opts
 	}
 
 	loaded.expireAt = time.Now().Add(sessionDuration)

@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
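The routes.go changes follow from load, New, and newLlmServer now taking *api.Options: the estimator adjusts NumCtx and NumGPU in place, so handlers pass &opts and loaded.Options keeps that same pointer rather than the address of a local copy. A minimal sketch of the value-versus-pointer difference, using a stand-in struct rather than the real api.Options:

```go
package main

import "fmt"

// options is a stand-in for the subset of api.Options the estimator adjusts.
type options struct {
	NumCtx int
	NumGPU int
}

// estimate mimics llm.New adjusting the options it is given.
func estimate(opts *options) {
	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}
	opts.NumGPU = 28 // e.g. the number of layers that fit
}

func main() {
	opts := options{NumCtx: 2, NumGPU: -1}

	estimate(&opts)                       // pointer: the caller observes the adjusted values
	fmt.Println(opts.NumCtx, opts.NumGPU) // 4 28
}
```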