OpenDAS / ollama

Commit 5a25f935 (Unverified)
Authored Apr 10, 2024 by Michael Yang; committed by GitHub on Apr 10, 2024

Merge pull request #3478 from ollama/mxyng/tensor-layer

refactor tensor query

Parents: c5c451ca, 7e33a017

Showing 8 changed files with 134 additions and 110 deletions (+134, -110)
format/bytes.go       +1  -1
gpu/gpu.go            +3  -3
gpu/gpu_darwin.go     +3  -4
gpu/types.go          +1  -1
llm/ggla.go           +1  -1
llm/ggml.go           +74 -51
llm/gguf.go           +1  -1
llm/server.go         +50 -48
format/bytes.go

@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
gpu/gpu.go

@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
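Aside (not part of the commit): a minimal standalone sketch of the OLLAMA_MAX_VRAM override path changed above. The environment variable is still parsed as a signed int64 with strconv.ParseInt and then cast to the new unsigned return type. The helper name and the example value are hypothetical; this is not the ollama gpu package.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// vramOverride mirrors the OLLAMA_MAX_VRAM branch of CheckVRAM above:
// parse the env var as int64, then hand back a uint64 to the caller.
func vramOverride() (uint64, bool) {
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit == "" {
		return 0, false
	}

	avail, err := strconv.ParseInt(userLimit, 10, 64)
	if err != nil {
		return 0, false
	}

	return uint64(avail), true
}

func main() {
	os.Setenv("OLLAMA_MAX_VRAM", "4294967296") // 4 GiB, illustrative value
	if v, ok := vramOverride(); ok {
		fmt.Println(v) // 4294967296
	}
}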
gpu/gpu_darwin.go

@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
 
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {
gpu/types.go

@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }
llm/ggla.go

@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
llm/ggml.go

@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
@@ -304,49 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
-	}
-
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
-	}
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
-		}
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
 
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	}
 
-	return 0, false
+	return
 }
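Aside (not part of the commit): a minimal, self-contained sketch of how the new Tensors.Layers grouping behaves, which is what replaces the removed GGML.LayerSize prefix scan. The Tensor here is a stub whose size comes from a plain bytes field; in the package above, a tensor's size is derived from its kind and shape.

package main

import (
	"fmt"
	"strings"
)

// Stub tensor for illustration; the real Tensor computes size() from kind and shape.
type Tensor struct {
	Name  string
	bytes uint64
}

func (t *Tensor) size() uint64 { return t.bytes }

type Layer map[string]*Tensor

func (l Layer) size() (size uint64) {
	for _, t := range l {
		size += t.size()
	}
	return size
}

type Tensors []*Tensor

// Layers groups tensors by layer key: "blk.3.attn_q.weight" lands in
// layers["3"]["attn_q.weight"], "output.weight" in layers["output"]["weight"].
func (ts Tensors) Layers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts {
		parts := strings.Split(t.Name, ".")
		if parts[0] == "blk" {
			parts = parts[1:]
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}

	return layers
}

func main() {
	ts := Tensors{
		{Name: "blk.0.attn_q.weight", bytes: 100},
		{Name: "blk.0.attn_k.weight", bytes: 50},
		{Name: "blk.1.attn_q.weight", bytes: 100},
		{Name: "output.weight", bytes: 200},
	}

	layers := ts.Layers()
	fmt.Println(layers["0"].size())      // 150
	fmt.Println(layers["1"].size())      // 100
	fmt.Println(layers["output"].size()) // 200
}

This per-layer map is what llm/server.go below now uses to price each block (layers[fmt.Sprintf("%d", i)].size()) and the output layer (layers["output"].size()) instead of scanning all tensors by name prefix.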
llm/gguf.go

@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
llm/server.go

@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
@@ -65,67 +61,79 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
+	}
 
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
+		}
 	}
 
-	requiredMemory := usedMemory
-
-	var layers int
+	var layerCount int
+	layers := ggml.Tensors().Layers()
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
-		requiredMemory += layerMemory
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
 
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
+
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
-	requiredMemory += memOutputLayer
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
 
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info(
 		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
 		"kv", format.HumanBytes2(kv),
-		"graph", format.HumanBytes2(graph),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
@@ -282,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -294,18 +302,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
-	}
-
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
+	}
+
+	return mem
 }
 
 type ServerStatus int
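Aside (not part of the commit): a worked instance of the fp16 KV-cache formula in the comment above, using illustrative Llama-7B-like metadata (n_ctx=2048, n_layer=32, n_embd=4096, n_head=32, n_head_kv=32). The real values come from ggml.KV(); the numbers here are assumptions for the arithmetic only.

package main

import "fmt"

func main() {
	// Illustrative values; the server reads these from the GGUF metadata.
	var (
		nCtx    uint64 = 2048
		nLayer  uint64 = 32
		nEmbd   uint64 = 4096
		nHead   uint64 = 32
		nHeadKV uint64 = 32
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV

	fmt.Printf("%d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))
	// Output: 1073741824 bytes (1.00 GiB)
}

Note the divide by n_head followed by multiply by n_head_kv: for grouped-query models with n_head_kv < n_head the cache shrinks proportionally, and the per-block share of this total is what the loop above adds to each memoryLayer.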