OpenDAS / ollama · Commit 7e33a017

partial offloading

Authored Apr 05, 2024 by Michael Yang
Parent: 8b2c1006

Showing 6 changed files with 96 additions and 84 deletions.
format/bytes.go    +1  -1
gpu/gpu.go         +3  -3
gpu/gpu_darwin.go  +3  -4
gpu/types.go       +1  -1
llm/ggml.go        +42 -35
llm/server.go      +46 -40
format/bytes.go

@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
     }
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
     switch {
     case b >= MebiByte:
         return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
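The only change here is the parameter type, int64 to uint64, matching the unsigned sizes now used for memory accounting. A minimal self-contained sketch of how the function reads after the change, assuming the package's binary-unit constants (only the MiB case is visible in the hunk; the smaller branches are illustrative):

package main

import "fmt"

// Binary-unit constants as assumed from format/bytes.go.
const (
    KibiByte = 1 << 10
    MebiByte = KibiByte << 10
)

// HumanBytes2 takes uint64 after this commit.
func HumanBytes2(b uint64) string {
    switch {
    case b >= MebiByte:
        return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
    case b >= KibiByte:
        return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte) // illustrative branch
    default:
        return fmt.Sprintf("%d B", b) // illustrative branch
    }
}

func main() {
    fmt.Println(HumanBytes2(5 * MebiByte / 2)) // prints "2.5 MiB"
}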
gpu/gpu.go

@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
     return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
     userLimit := os.Getenv("OLLAMA_MAX_VRAM")
     if userLimit != "" {
         avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
             return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
         }
         slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-        return avail, nil
+        return uint64(avail), nil
     }
 
     gpuInfo := GetGPUInfo()
     if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-        return int64(gpuInfo.FreeMemory), nil
+        return gpuInfo.FreeMemory, nil
     }
 
     return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
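CheckVRAM now returns uint64, but the OLLAMA_MAX_VRAM override is still parsed as a signed int64 and only converted on success. A standalone illustration of that override path (the helper name is a hypothetical stand-in, not the exported API):

package main

import (
    "fmt"
    "os"
    "strconv"
)

// parseVRAMOverride mirrors the override branch of CheckVRAM:
// parse as signed int64, then convert to uint64 on success.
// Note a negative setting would wrap around after conversion;
// the diff shown here does not guard against that.
func parseVRAMOverride() (uint64, bool, error) {
    userLimit := os.Getenv("OLLAMA_MAX_VRAM")
    if userLimit == "" {
        return 0, false, nil // no override set
    }
    avail, err := strconv.ParseInt(userLimit, 10, 64)
    if err != nil {
        return 0, false, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
    }
    return uint64(avail), true, nil
}

func main() {
    os.Setenv("OLLAMA_MAX_VRAM", "8589934592") // 8 GiB in bytes
    if v, ok, err := parseVRAMOverride(); err == nil && ok {
        fmt.Printf("user override OLLAMA_MAX_VRAM=%d\n", v)
    }
}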
gpu/gpu_darwin.go

@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
     userLimit := os.Getenv("OLLAMA_MAX_VRAM")
     if userLimit != "" {
         avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
             return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
         }
         slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-        return avail, nil
+        return uint64(avail), nil
     }
 
     if runtime.GOARCH == "amd64" {
         // gpu not supported, this may not be metal
         return 0, nil
     }
-    recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-    return recommendedMaxVRAM, nil
+    return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {
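On the Apple side, the non-override path now returns the Metal recommended working-set size as uint64 straight from the cgo call, dropping the int64 intermediate. A rough pure-Go sketch of that control flow, with C.getRecommendedMaxVRAM() replaced by a stub returning an assumed value:

package main

import (
    "fmt"
    "runtime"
)

// getRecommendedMaxVRAM stands in for the cgo call
// C.getRecommendedMaxVRAM(); the value is hypothetical.
func getRecommendedMaxVRAM() uint64 { return 48 << 30 }

// checkVRAMDarwin sketches the non-override path of the darwin
// CheckVRAM after this commit: amd64 (possibly non-Metal) reports
// zero, arm64 reports Metal's recommended working-set size.
func checkVRAMDarwin() (uint64, error) {
    if runtime.GOARCH == "amd64" {
        // gpu not supported, this may not be metal
        return 0, nil
    }
    return getRecommendedMaxVRAM(), nil
}

func main() {
    vram, _ := checkVRAMDarwin()
    fmt.Printf("recommended max VRAM: %d bytes\n", vram)
}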
gpu/types.go

@@ -15,7 +15,7 @@ type GpuInfo struct {
     Variant string `json:"variant,omitempty"`
 
     // MinimumMemory represents the minimum memory required to use the GPU
-    MinimumMemory int64 `json:"-"`
+    MinimumMemory uint64 `json:"-"`
 
     // TODO add other useful attributes about the card here for discovery information
 }
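MinimumMemory keeps its json:"-" tag, so widening the type never changes API responses; the JSON encoder simply skips the field. A quick check with a pared-down stand-in struct (the field set is abbreviated):

package main

import (
    "encoding/json"
    "fmt"
)

// Pared-down stand-in for gpu.GpuInfo; only the fields relevant
// to the tag behavior are shown.
type GpuInfo struct {
    Variant       string `json:"variant,omitempty"`
    MinimumMemory uint64 `json:"-"`
}

func main() {
    b, _ := json.Marshal(GpuInfo{Variant: "v11", MinimumMemory: 512 << 20})
    fmt.Println(string(b)) // prints {"variant":"v11"}; MinimumMemory is excluded
}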
llm/ggml.go

@@ -324,45 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
     }, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-    embeddingLength := llm.KV().EmbeddingLength()
-    headCount := llm.KV().HeadCount()
-    headCountKV := llm.KV().HeadCountKV()
-    vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-    layers := llm.Tensors().Layers()
-
-    var attnQKVWeight1 uint64 = 0
-    if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
-        attnQKVWeight1 = t.Shape[1]
-    }
-
-    var ffnGate0Weight1 uint64 = 0
-    if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
-        ffnGate0Weight1 = t.Shape[1]
-    }
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+    embedding := llm.KV().EmbeddingLength()
+    heads := llm.KV().HeadCount()
+    headsKV := llm.KV().HeadCountKV()
+    vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
     switch llm.KV().Architecture() {
-    case "gemma", "command-r":
-        return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-    case "phi2":
-        return max(
-            4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-            4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-        ), true
-    case "qwen2":
-        return max(
-            4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-            4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-        ), true
     case "llama":
-        if ffnGate0Weight1 > 0 {
-            // moe
-            return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
-        }
-
-        return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+        fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+        partialOffload = 4 * batch * embedding
+        partialOffload += max(
+            4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+            4*batch*(embedding+vocab)+embedding*vocab*105/128,
+        )
+    case "gemma":
+        fullOffload = 4 * batch * (embedding + vocab)
+        partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+    case "command-r":
+        fullOffload = max(
+            4*batch*(embedding+vocab),
+            4*batch*(2+4*embedding+context*(1+heads)),
+        )
+
+        partialOffload = max(
+            4*batch*(embedding+vocab)+embedding*vocab*105/128,
+            4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+        )
+    case "qwen2":
+        fullOffload = max(
+            4*batch*(embedding+vocab),
+            4*batch*(1+2*embedding+context+context*heads),
+        )
+
+        partialOffload = max(
+            4*batch*(embedding+vocab)+embedding*vocab*105/128,
+            4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+        )
+    case "phi2":
+        fullOffload = max(
+            4*batch*(embedding+vocab),
+            4*batch*(1+4*embedding+context+context*heads),
+        )
+
+        partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
     }
 
-    return 0, false
+    return
 }
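To make the new (partialOffload, fullOffload) pair concrete, here is a worked evaluation of the "llama" branch with illustrative Llama-2-7B-style dimensions (assumed values, not taken from the commit):

package main

import "fmt"

func main() {
    // Illustrative Llama-2-7B-style dimensions (assumptions, not from the commit).
    const (
        embedding uint64 = 4096  // n_embd
        heads     uint64 = 32    // n_head
        headsKV   uint64 = 32    // n_head_kv
        vocab     uint64 = 32000 // tokenizer.ggml.tokens length
        context   uint64 = 2048  // n_ctx
        batch     uint64 = 512   // n_batch
    )

    // The "llama" branch of the new GraphSize, evaluated directly.
    fullOffload := 4 * batch * (1 + 4*embedding + context*(1+heads))

    partialOffload := 4 * batch * embedding
    partialOffload += max(
        4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
        4*batch*(embedding+vocab)+embedding*vocab*105/128,
    )

    // With these numbers: fullOffload is about 164.0 MiB, partialOffload about 193.0 MiB.
    fmt.Printf("fullOffload    = %.1f MiB\n", float64(fullOffload)/(1<<20))
    fmt.Printf("partialOffload = %.1f MiB\n", float64(partialOffload)/(1<<20))
}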
llm/server.go

@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-    if _, err := os.Stat(model); err != nil {
-        return nil, err
-    }
-
     f, err := os.Open(model)
     if err != nil {
         return nil, err
@@ -65,12 +61,12 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
         opts.NumCtx = 4
     }
 
-    availableMemory, _ := gpu.CheckVRAM()
+    memoryAvailable, _ := gpu.CheckVRAM()
     info := gpu.GetGPUInfo()
 
-    usedMemory := info.MinimumMemory
+    memoryMinimum := info.MinimumMemory
     for _, projector := range projectors {
-        usedMemory += projectorMemoryRequirements(projector)
+        memoryMinimum += projectorMemoryRequirements(projector)
 
         // multimodal models require at least 2048 context
         opts.NumCtx = max(opts.NumCtx, 2048)
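The kv term carried through the next hunk unchanged is the fp16 key/value cache: (1 k + 1 v) x 2 bytes x n_ctx x n_layer x n_embd / n_head x n_head_kv. A worked check with the same illustrative 7B-style numbers as above (assumptions, not from the commit):

package main

import "fmt"

func main() {
    // Illustrative 7B-style dimensions (assumptions).
    var (
        numCtx     uint64 = 2048 // n_ctx
        blockCount uint64 = 32   // n_layer
        embedding  uint64 = 4096 // n_embd
        heads      uint64 = 32   // n_head
        headsKV    uint64 = 32   // n_head_kv
    )

    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    kv := 2 * 2 * numCtx * blockCount * embedding / heads * headsKV

    // With these numbers: 1 GiB total, i.e. 32 MiB per layer.
    fmt.Printf("kv cache: %d bytes (%.0f MiB total, %.0f MiB per layer)\n",
        kv, float64(kv)/(1<<20), float64(kv/blockCount)/(1<<20))
}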
@@ -79,55 +75,65 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
     // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
     var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-    graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-    if !ok {
-        graph = int64(ggml.KV().GQA()*kv) / 6
+    graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+    if graphPartialOffload == 0 {
+        graphPartialOffload = ggml.KV().GQA() * kv / 6
     }
 
-    usedMemory += graph
+    if graphFullOffload == 0 {
+        graphFullOffload = graphPartialOffload
+    }
 
-    if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-        info.Library = "cpu"
-    }
+    // memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+    memoryRequiredTotal := memoryMinimum + graphFullOffload
 
-    requiredMemory := usedMemory
+    // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+    memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
-    tensorLayers := ggml.Tensors().Layers()
+    if info.Library != "metal" {
+        if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+            info.Library = "cpu"
+        }
+    }
 
-    var layers int
+    var layerCount int
+    layers := ggml.Tensors().Layers()
     for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-        layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
-        requiredMemory += layerMemory
+        memoryLayer := layers[fmt.Sprintf("%d", i)].size()
 
-        if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-            usedMemory += layerMemory
-            layers++
+        // KV is proportional to the number of layers
+        memoryLayer += kv / ggml.KV().BlockCount()
+
+        memoryRequiredTotal += memoryLayer
+        if memoryAvailable > memoryRequiredPartial+memoryLayer {
+            memoryRequiredPartial += memoryLayer
+            layerCount++
         }
     }
 
-    memOutputLayer := int64(tensorLayers["output"].size())
-    requiredMemory += memOutputLayer
+    memoryLayerOutput := layers["output"].size()
+    memoryRequiredTotal += memoryLayerOutput
 
-    // only offload output layer if all repeating layers are offloaded
-    if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-        usedMemory += memOutputLayer
-        layers++
+    if memoryAvailable > memoryRequiredTotal {
+        layerCount = int(ggml.KV().BlockCount()) + 1
+        memoryRequiredPartial = memoryRequiredTotal
+    }
+
+    if opts.NumGPU < 0 {
+        opts.NumGPU = layerCount
     }
 
     slog.Info(
         "offload to gpu",
-        "layers", layers,
-        "reallayers", opts.NumGPU,
-        "required", format.HumanBytes2(requiredMemory),
-        "used", format.HumanBytes2(usedMemory),
-        "available", format.HumanBytes2(availableMemory),
-        "kv", format.HumanBytes2(int64(kv)),
-        "graph", format.HumanBytes2(graph),
+        "layers", layerCount,
+        "required", format.HumanBytes2(memoryRequiredTotal),
+        "used", format.HumanBytes2(memoryRequiredPartial),
+        "available", format.HumanBytes2(memoryAvailable),
+        "kv", format.HumanBytes2(kv),
+        "fulloffload", format.HumanBytes2(graphFullOffload),
+        "partialoffload", format.HumanBytes2(graphPartialOffload),
     )
 
-    if opts.NumGPU < 0 && info.Library != "cpu" {
-        opts.NumGPU = layers
-    }
-
     if len(adapters) > 1 {
         return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
     }
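This hunk is the heart of partial offloading: rather than falling back to CPU whenever the whole model does not fit, the loop counts how many repeating layers (each carrying its share of the KV cache) fit into available memory, and the output layer is added only when everything else fits. A standalone sketch of the accounting with hypothetical sizes:

package main

import "fmt"

func main() {
    // Hypothetical budget and per-component sizes, in bytes.
    var (
        memoryAvailable uint64 = 6 << 30   // 6 GiB of free VRAM
        memoryMinimum   uint64 = 512 << 20 // driver/runtime floor
        graphPartial    uint64 = 400 << 20 // graphPartialOffload
        graphFull       uint64 = 450 << 20 // graphFullOffload
        blockCount             = 32        // repeating layers
        layerSize       uint64 = 190 << 20 // weights + kv share per layer
        outputLayer     uint64 = 250 << 20 // "output" tensor group
    )

    memoryRequiredTotal := memoryMinimum + graphFull      // full offload (all layers)
    memoryRequiredPartial := memoryMinimum + graphPartial // partial offload (0 < n < layers)

    var layerCount int
    for i := 0; i < blockCount; i++ {
        memoryRequiredTotal += layerSize
        // Greedily claim a layer while it still fits in the budget.
        if memoryAvailable > memoryRequiredPartial+layerSize {
            memoryRequiredPartial += layerSize
            layerCount++
        }
    }

    memoryRequiredTotal += outputLayer
    if memoryAvailable > memoryRequiredTotal {
        // Only offload the output layer when every repeating layer fits.
        layerCount = blockCount + 1
        memoryRequiredPartial = memoryRequiredTotal
    }

    // With these numbers: offloading 27 of 32+1 layers.
    fmt.Printf("offloading %d of %d+1 layers\n", layerCount, blockCount)
}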
@@ -284,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
     return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
     file, err := os.Open(filename)
     if err != nil {
         return 0
@@ -301,7 +307,7 @@ func projectorMemoryRequirements(filename string) int64 {
         mem += layer.size()
     }
 
-    return int64(mem)
+    return mem
 }
 
 type ServerStatus int
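projectorMemoryRequirements now returns uint64 directly, dropping the final int64 round-trip, since the tensor sizes it sums are already unsigned. A hedged sketch of that summation over a hypothetical Layers map (the real code decodes these structures from the projector file via DecodeGGML):

package main

import "fmt"

// Tensor and Layer are hypothetical stand-ins for the GGUF
// structures; only the size accounting is modeled here.
type Tensor struct{ size uint64 }

type Layer map[string]Tensor

func layerSize(l Layer) uint64 {
    var sum uint64
    for _, t := range l {
        sum += t.size
    }
    return sum
}

// projectorMemory mirrors the projectorMemoryRequirements-style
// summation: add up every tensor in every layer, with no int64
// conversion at the end.
func projectorMemory(layers map[string]Layer) uint64 {
    var mem uint64
    for _, layer := range layers {
        mem += layerSize(layer)
    }
    return mem
}

func main() {
    layers := map[string]Layer{
        "v.blk.0": {"attn_q.weight": {4 << 20}, "attn_k.weight": {4 << 20}},
        "mm.0":    {"weight": {16 << 20}},
    }
    fmt.Printf("projector needs %d MiB\n", projectorMemory(layers)/(1<<20)) // 24 MiB
}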