Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
c7c2f3bc
Unverified
Commit
c7c2f3bc
authored
Jun 20, 2024
by
Daniel Hiltgen
Committed by
GitHub
Jun 20, 2024
Browse files
Merge pull request #5194 from dhiltgen/linux_mmap_auto
Refine mmap default logic on linux
parents
54a79d6a
5bf5aeec
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
12 deletions
+16
-12
llm/server.go
llm/server.go
+16
-12
No files found.
llm/server.go
View file @
c7c2f3bc
...
...
@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var
err
error
var
cpuRunner
string
var
estimate
MemoryEstimate
var
systemMemory
uint64
var
systemTotalMemory
uint64
var
systemFreeMemory
uint64
systemMemInfo
,
err
:=
gpu
.
GetCPUMem
()
if
err
!=
nil
{
slog
.
Error
(
"failed to lookup system memory"
,
"error"
,
err
)
}
else
{
systemTotalMemory
=
systemMemInfo
.
TotalMemory
systemFreeMemory
=
systemMemInfo
.
FreeMemory
slog
.
Debug
(
"system memory"
,
"total"
,
format
.
HumanBytes2
(
systemTotalMemory
),
"free"
,
systemFreeMemory
)
}
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if
opts
.
NumGPU
==
0
{
...
...
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
cpuRunner
=
serverForCpu
()
estimate
=
EstimateGPULayers
(
gpus
,
ggml
,
projectors
,
opts
)
}
else
{
if
gpus
[
0
]
.
Library
==
"metal"
{
memInfo
,
err
:=
gpu
.
GetCPUMem
()
if
err
!=
nil
{
slog
.
Error
(
"failed to lookup system memory"
,
"error"
,
err
)
}
else
{
systemMemory
=
memInfo
.
TotalMemory
slog
.
Debug
(
"system memory"
,
"total"
,
format
.
HumanBytes2
(
systemMemory
))
}
}
estimate
=
EstimateGPULayers
(
gpus
,
ggml
,
projectors
,
opts
)
switch
{
case
gpus
[
0
]
.
Library
==
"metal"
&&
estimate
.
VRAMSize
>
systemMemory
:
case
gpus
[
0
]
.
Library
==
"metal"
&&
estimate
.
VRAMSize
>
systemTotalMemory
:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
opts
.
NumGPU
=
0
...
...
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
// Windows CUDA should not use mmap for best performance
if
(
runtime
.
GOOS
==
"windows"
&&
gpus
[
0
]
.
Library
==
"cuda"
)
||
opts
.
UseMMap
==
api
.
TriStateFalse
{
// On Linux, when the model is larger than free system memory, mmap leads to thrashing
if
(
runtime
.
GOOS
==
"windows"
&&
gpus
[
0
]
.
Library
==
"cuda"
&&
opts
.
UseMMap
==
api
.
TriStateUndefined
)
||
(
runtime
.
GOOS
==
"linux"
&&
systemFreeMemory
<
estimate
.
TotalSize
&&
opts
.
UseMMap
==
api
.
TriStateUndefined
)
||
opts
.
UseMMap
==
api
.
TriStateFalse
{
params
=
append
(
params
,
"--no-mmap"
)
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment