OpenDAS / ollama · Commits

Commit 5a25f935 (unverified)
Authored Apr 10, 2024 by Michael Yang; committed by GitHub, Apr 10, 2024

Merge pull request #3478 from ollama/mxyng/tensor-layer

refactor tensor query

Parents: c5c451ca, 7e33a017
Showing 8 changed files with 134 additions and 110 deletions.
format/bytes.go     +1  -1
gpu/gpu.go          +3  -3
gpu/gpu_darwin.go   +3  -4
gpu/types.go        +1  -1
llm/ggla.go         +1  -1
llm/ggml.go         +74 -51
llm/gguf.go         +1  -1
llm/server.go       +50 -48
format/bytes.go

@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
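The only change here is the parameter type: HumanBytes2 now takes a uint64, so the unsigned byte counts used elsewhere in this commit can be formatted without an int64 conversion. A minimal, hypothetical usage sketch (not part of the commit; it assumes the github.com/ollama/ollama/format import path and only relies on the MiB case visible in the diff):

package main

import (
	"fmt"

	"github.com/ollama/ollama/format"
)

func main() {
	// After this change the argument is unsigned, matching how the rest of
	// the commit tracks memory sizes as uint64 byte counts.
	var used uint64 = 512 * 1024 * 1024
	fmt.Println(format.HumanBytes2(used)) // "512.0 MiB", per the case shown above
}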
gpu/gpu.go

@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
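The uint64(avail) conversion is needed because strconv.ParseInt still returns an int64 while CheckVRAM now reports free VRAM as a uint64. A self-contained sketch of that override path (a hypothetical helper, not the package code; it only mirrors the OLLAMA_MAX_VRAM handling shown above):

package main

import (
	"fmt"
	"os"
	"strconv"
)

// vramOverride mirrors the override branch of CheckVRAM: ParseInt yields an
// int64, so the value is converted once at the return site now that VRAM is
// reported as a uint64.
func vramOverride() (uint64, bool, error) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit == "" {
		return 0, false, nil
	}

	avail, err := strconv.ParseInt(userLimit, 10, 64)
	if err != nil {
		return 0, false, fmt.Errorf("invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
	}

	return uint64(avail), true, nil
}

func main() {
	os.Setenv("OLLAMA_MAX_VRAM", "4294967296") // 4 GiB, for illustration only
	if limit, ok, err := vramOverride(); err == nil && ok {
		fmt.Println("user override OLLAMA_MAX_VRAM =", limit)
	}
}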
gpu/gpu_darwin.go

@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {
gpu/types.go

@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }
llm/ggla.go

@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
llm/ggml.go

@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
@@ -304,49 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
-	}
-
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
-	}
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
-		}
-
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	}
 
-	return 0, false
+	return
 }
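This is the core of the refactor: instead of scanning the whole tensor list with name-prefix matching (the removed LayerSize), callers take a Tensors value, group it once with Layers() keyed on the layer component of each GGUF tensor name ("blk.N.<suffix>" or "output.<suffix>"), and sum a group with Layer.size(). A self-contained sketch of that grouping (hypothetical types with stubbed sizes, since Tensor.size() is not shown in this diff; the tensor names are illustrative GGUF-style names):

package main

import (
	"fmt"
	"strings"
)

// tensor stands in for llm.Tensor; its size is a plain field here because the
// real size() method is not part of this diff.
type tensor struct {
	name string
	size uint64
}

// layers mirrors the grouping done by Tensors.Layers().
func layers(ts []tensor) map[string]map[string]uint64 {
	out := make(map[string]map[string]uint64)
	for _, t := range ts {
		parts := strings.Split(t.name, ".")
		if parts[0] == "blk" {
			parts = parts[1:] // "blk.0.attn_q.weight" groups under key "0"
		}
		if _, ok := out[parts[0]]; !ok {
			out[parts[0]] = make(map[string]uint64)
		}
		out[parts[0]][strings.Join(parts[1:], ".")] = t.size
	}
	return out
}

func main() {
	ts := []tensor{
		{"blk.0.attn_q.weight", 1 << 20},
		{"blk.0.ffn_up.weight", 2 << 20},
		{"output.weight", 4 << 20},
	}

	for name, layer := range layers(ts) {
		var total uint64
		for _, sz := range layer {
			total += sz // the per-group sum that Layer.size() computes
		}
		fmt.Printf("layer %q: %d tensors, %d bytes\n", name, len(layer), total)
	}
}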
llm/gguf.go

@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
llm/server.go

@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
@@ -65,67 +61,79 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
-
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
 	}
 
-	requiredMemory := usedMemory
-
-	var layers int
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
+		}
+	}
+
+	var layerCount int
+	layers := ggml.Tensors().Layers()
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
-		requiredMemory += layerMemory
-
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
+
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
+
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
-	requiredMemory += memOutputLayer
-
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
+
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info("offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
 		"kv", format.HumanBytes2(kv),
-		"graph", format.HumanBytes2(graph),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
@@ -282,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -294,18 +302,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
-	}
-
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}
 
-	return ask
+	return mem
 }
 
 type ServerStatus int
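Taken together, NewLlamaServer now tracks two budgets: memoryRequiredPartial (the minimum footprint plus the partial-offload graph) and memoryRequiredTotal (the minimum plus the full-offload graph). It walks the per-layer sizes from ggml.Tensors().Layers(), adds each layer's share of the KV cache, counts how many layers still fit under memoryAvailable, and only counts the output layer once everything fits. A hypothetical arithmetic sketch of that loop with made-up sizes (not the server code; the names simply mirror the diff above):

package main

import "fmt"

// Sketch of the layer-counting arithmetic in NewLlamaServer, with invented
// numbers. memoryRequiredPartial starts at the floor plus the partial-offload
// graph and grows layer by layer while it still fits in available VRAM;
// memoryRequiredTotal accumulates every layer regardless.
func main() {
	const (
		memoryAvailable uint64 = 6 << 30   // pretend CheckVRAM reported 6 GiB
		memoryMinimum   uint64 = 512 << 20 // pretend driver/projector floor
		graphPartial    uint64 = 256 << 20
		graphFull       uint64 = 512 << 20
		blockCount      uint64 = 32
		kv              uint64 = 1 << 30   // whole-model fp16 KV cache
		layerSize       uint64 = 160 << 20 // pretend size of each "blk.N" layer
	)

	memoryRequiredTotal := memoryMinimum + graphFull
	memoryRequiredPartial := memoryMinimum + graphPartial

	var layerCount int
	for i := uint64(0); i < blockCount; i++ {
		memoryLayer := layerSize + kv/blockCount // KV is split evenly across layers
		memoryRequiredTotal += memoryLayer
		if memoryAvailable > memoryRequiredPartial+memoryLayer {
			memoryRequiredPartial += memoryLayer
			layerCount++
		}
	}

	fmt.Printf("offloadable layers: %d of %d\n", layerCount, blockCount)
	fmt.Printf("partial: %d MiB, total: %d MiB, available: %d MiB\n",
		memoryRequiredPartial>>20, memoryRequiredTotal>>20, memoryAvailable>>20)
}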