OpenDAS / ollama · Commit 8b2c1006

refactor tensor query

Authored Apr 03, 2024 by Michael Yang
Parent: c5c451ca

Showing 4 changed files with 54 additions and 42 deletions (+54 / -42)
llm/ggla.go    +1  -1
llm/ggml.go    +41 -25
llm/gguf.go    +1  -1
llm/server.go  +11 -15
llm/ggla.go

@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
llm/ggml.go

@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
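For illustration, here is a minimal sketch of how the new Tensors.Layers() grouping behaves. It assumes the Tensor, Tensors, and Layer types added above (imagine it inside a test in package llm); the tensor names follow the usual GGUF blk.N.* convention and are hypothetical placeholders, not data from this commit:

	// Hypothetical input: a handful of tensors as they would be named in a GGUF file.
	ts := Tensors{
		{Name: "token_embd.weight"},
		{Name: "blk.0.attn_qkv.weight"},
		{Name: "blk.0.ffn_gate.0.weight"},
		{Name: "blk.1.attn_qkv.weight"},
		{Name: "output.weight"},
	}

	layers := ts.Layers()
	// The "blk." prefix is stripped, so repeating layers are keyed by their index:
	//   layers["0"]["attn_qkv.weight"]   -> blk.0.attn_qkv.weight
	//   layers["0"]["ffn_gate.0.weight"] -> blk.0.ffn_gate.0.weight
	//   layers["1"]["attn_qkv.weight"]   -> blk.1.attn_qkv.weight
	// Non-repeating tensors are keyed by their first name component:
	//   layers["token_embd"]["weight"]   -> token_embd.weight
	//   layers["output"]["weight"]       -> output.weight
	// layers["0"].size() then gives the byte size of everything in block 0.

This grouping is what lets GraphSize below and llm/server.go look up per-block and output-layer tensors directly instead of re-scanning the whole tensor list.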
@@ -310,20 +330,16 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 	headCountKV := llm.KV().HeadCountKV()
 
 	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
 
+	layers := llm.Tensors().Layers()
+
 	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
+	if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
+		attnQKVWeight1 = t.Shape[1]
 	}
 
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
+	var ffnGate0Weight1 uint64 = 0
+	if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
+		ffnGate0Weight1 = t.Shape[1]
 	}
 
 	switch llm.KV().Architecture() {
@@ -340,11 +356,11 @@ func (llm GGML) GraphSize(context, batch int) (int64, bool) {
 			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
 		), true
 	case "llama":
-		if ffnGate1 > 0 {
+		if ffnGate0Weight1 > 0 {
 			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
+			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
 		}
 
 		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
 	}
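To make the non-MoE llama graph estimate concrete, here is a small stand-alone sketch plugging assumed 7B-class hyperparameters into the same formula. The numbers (4096 embedding length, 32 heads, 2048 context, 512 batch) are illustrative defaults, not values taken from this commit:

	package main

	import "fmt"

	func main() {
		// Assumed (hypothetical) hyperparameters, roughly a 7B llama-style model.
		var (
			batch           int64  = 512
			context         uint64 = 2048
			embeddingLength uint64 = 4096
			headCount       uint64 = 32
		)

		// Same shape as the non-MoE return above:
		// 4 * batch * (1 + 4*n_embd + n_ctx + n_ctx*n_head)
		graph := 4 * batch * int64(1+4*embeddingLength+context+context*headCount)
		fmt.Println(graph) // 171968512 bytes, roughly 164 MiB of graph scratch
	}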
llm/gguf.go

@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
llm/server.go

@@ -77,11 +77,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
 	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
 	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+		graph = int64(ggml.KV().GQA()*kv) / 6
 	}
 
 	usedMemory += graph
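As a point of reference, here is a stand-alone sketch of what that fp16 KV-cache estimate works out to for assumed 7B-class hyperparameters (2048 context, 32 layers, 4096 embedding length, 32 heads, 32 KV heads). These values are illustrative only and are not taken from the commit:

	package main

	import "fmt"

	func main() {
		// Assumed (hypothetical) model hyperparameters.
		var (
			numCtx          uint64 = 2048
			blockCount      uint64 = 32
			embeddingLength uint64 = 4096
			headCount       uint64 = 32
			headCountKV     uint64 = 32
		)

		// Same formula as the comment above:
		// (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
		kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV
		fmt.Println(kv) // 1073741824 bytes = 1 GiB for the full context window
	}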
@@ -92,9 +92,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	requiredMemory := usedMemory
 
+	tensorLayers := ggml.Tensors().Layers()
+
 	var layers int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
 		requiredMemory += layerMemory
 
 		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
@@ -103,7 +105,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
+	memOutputLayer := int64(tensorLayers["output"].size())
 	requiredMemory += memOutputLayer
 
 	// only offload output layer if all repeating layers are offloaded
@@ -118,7 +120,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		"required", format.HumanBytes2(requiredMemory),
 		"used", format.HumanBytes2(usedMemory),
 		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(kv),
+		"kv", format.HumanBytes2(int64(kv)),
 		"graph", format.HumanBytes2(graph),
 	)
@@ -294,18 +296,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
-	}
-
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}
 
-	return ask
+	return int64(mem)
 }
 
 type ServerStatus int
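The rewritten projectorMemoryRequirements simply totals every tensor in the projector file: each tensor lands in exactly one Layer, so summing layer.size() over Layers() counts each tensor's bytes once. A hedged stand-alone equivalent follows; the helper name totalTensorBytes is hypothetical and assumes the Tensors and Layer types from llm/ggml.go above:

	// totalTensorBytes mirrors the loop in projectorMemoryRequirements:
	// group the tensors by layer, then add up the byte size of each group.
	func totalTensorBytes(ts Tensors) int64 {
		var mem uint64
		for _, layer := range ts.Layers() {
			mem += layer.size()
		}
		return int64(mem)
	}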