OpenDAS / ollama / Commits

Commit 9ef2fce3 (Unverified)
Authored Oct 16, 2023 by Michael Yang; committed by GitHub on Oct 16, 2023

Merge pull request #768 from jmorganca/mxyng/bytes

fix memory check

Parents: 43eaba3d 11d82d7b

Showing 4 changed files with 50 additions and 50 deletions (+50 -50)
api/client.go    +2  -1
format/bytes.go  +13 -6
llm/llama.go     +12 -12
llm/llm.go       +23 -31
api/client.go

@@ -14,6 +14,7 @@ import (
 	"runtime"
 	"strings"

+	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/version"
 )
@@ -127,7 +128,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	return nil
 }

-const maxBufferSize = 512 * 1000 // 512KB
+const maxBufferSize = 512 * format.KiloByte

 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
 	var buf *bytes.Buffer
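The visible hunk only shows the constant being redefined in terms of format.KiloByte; how stream consumes it is not part of this diff. A plausible, minimal sketch of how such a cap is typically applied to a bufio.Scanner reading a streamed response (the reader and buffer setup here are illustrative assumptions, not taken from the commit):

package main

import (
	"bufio"
	"fmt"
	"strings"
)

const kiloByte = 1000
const maxBufferSize = 512 * kiloByte // mirrors the 512 KB cap in the diff

func main() {
	// Stand-in for a streamed HTTP response body.
	scanner := bufio.NewScanner(strings.NewReader("line one\nline two\n"))
	// Allow the scanner's token buffer to grow up to maxBufferSize so long
	// streamed lines are not rejected with bufio.ErrTooLong.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxBufferSize)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}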
format/bytes.go

@@ -2,14 +2,21 @@ package format

 import "fmt"

+const (
+	Byte     = 1
+	KiloByte = Byte * 1000
+	MegaByte = KiloByte * 1000
+	GigaByte = MegaByte * 1000
+)
+
 func HumanBytes(b int64) string {
 	switch {
-	case b > 1000*1000*1000:
-		return fmt.Sprintf("%d GB", b/1000/1000/1000)
-	case b > 1000*1000:
-		return fmt.Sprintf("%d MB", b/1000/1000)
-	case b > 1000:
-		return fmt.Sprintf("%d KB", b/1000)
+	case b > GigaByte:
+		return fmt.Sprintf("%d GB", b/GigaByte)
+	case b > MegaByte:
+		return fmt.Sprintf("%d MB", b/MegaByte)
+	case b > KiloByte:
+		return fmt.Sprintf("%d KB", b/KiloByte)
 	default:
 		return fmt.Sprintf("%d B", b)
 	}
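The new constants use decimal (SI) multiples, so format.GigaByte is 1000^3 bytes rather than 1024^3. A quick usage sketch against the package as it stands after this diff:

package main

import (
	"fmt"

	"github.com/jmorganca/ollama/format"
)

func main() {
	fmt.Println(format.HumanBytes(512 * format.KiloByte)) // 512 KB
	fmt.Println(format.HumanBytes(2 * format.GigaByte))   // 2 GB
	fmt.Println(format.HumanBytes(999))                   // 999 B
}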
llm/llama.go

@@ -24,6 +24,7 @@ import (
 	"time"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/format"
 )

 //go:embed llama.cpp/*/build/*/bin/*
@@ -197,7 +198,7 @@ type llama struct {

 var errNoGPU = errors.New("nvidia-smi command failed")

-// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
+// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
 	var stdout bytes.Buffer
@@ -207,7 +208,7 @@ func CheckVRAM() (int64, error) {
 		return 0, errNoGPU
 	}

-	var free int64
+	var freeMiB int64
 	scanner := bufio.NewScanner(&stdout)
 	for scanner.Scan() {
 		line := scanner.Text()
@@ -216,15 +217,16 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
 		}

-		free += vram
+		freeMiB += vram
 	}

-	if free*1024*1024 < 2*1000*1000*1000 {
+	freeBytes := freeMiB * 1024 * 1024
+	if freeBytes < 2*format.GigaByte {
 		log.Printf("less than 2 GB VRAM available, falling back to CPU only")
-		free = 0
+		freeMiB = 0
 	}

-	return free, nil
+	return freeBytes, nil
 }

 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
@@ -232,7 +234,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		return opts.NumGPU
 	}
 	if runtime.GOOS == "linux" {
-		vramMib, err := CheckVRAM()
+		freeBytes, err := CheckVRAM()
 		if err != nil {
 			if err.Error() != "nvidia-smi command failed" {
 				log.Print(err.Error())
@@ -241,15 +243,13 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 			return 0
 		}

-		freeVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
-
 		// Calculate bytes per layer
 		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
 		bytesPerLayer := fileSizeBytes / numLayer

 		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-		layers := int(freeVramBytes/bytesPerLayer) * 92 / 100
-		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
+		layers := int(freeBytes/bytesPerLayer) * 92 / 100
+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", freeBytes, layers)

 		return layers
 	}
@@ -509,7 +509,7 @@ type PredictRequest struct {
 	Stop             []string `json:"stop,omitempty"`
 }

-const maxBufferSize = 512 * 1000 // 512KB
+const maxBufferSize = 512 * format.KiloByte

 func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
 	prevConvo, err := llm.Decode(ctx, prevContext)
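The layer-count heuristic spreads the model file size evenly across layers and keeps 8% of VRAM in reserve. A worked example under assumed numbers (a hypothetical ~3.8 GB model file with 32 layers and 8 GiB of free VRAM):

package main

import "fmt"

func main() {
	// Hypothetical inputs, chosen for illustration only.
	fileSizeBytes := int64(3_800_000_000)
	numLayer := int64(32)
	freeBytes := int64(8) * 1024 * 1024 * 1024

	// Rough per-layer cost, as the diff's TODO comment acknowledges.
	bytesPerLayer := fileSizeBytes / numLayer // 118750000

	// Max layers that fit in VRAM, minus 8% headroom to avoid exhausting it.
	layers := int(freeBytes/bytesPerLayer) * 92 / 100

	fmt.Println(layers) // 66, comfortably more than the model's 32 layers
}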
llm/llm.go

@@ -10,6 +10,7 @@ import (
 	"github.com/pbnjay/memory"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/format"
 )

 type LLM interface {
@@ -55,39 +56,30 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
 			opts.NumGPU = 0
 		}
 	}

-	totalResidentMemory := memory.TotalMemory()
-	switch ggml.ModelType() {
-	case "3B", "7B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 16*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 16 GB of memory")
-		} else if totalResidentMemory < 8*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 8 GB of memory")
-		}
-	case "13B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 32*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 32 GB of memory")
-		} else if totalResidentMemory < 16*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 16 GB of memory")
-		}
-	case "30B", "34B", "40B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 64*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 64 GB of memory")
-		} else if totalResidentMemory < 32*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 32 GB of memory")
-		}
-	case "65B", "70B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 128*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 128 GB of memory")
-		} else if totalResidentMemory < 64*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 64 GB of memory")
-		}
-	case "180B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 512*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
-		} else if totalResidentMemory < 128*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 128GB of memory")
-		}
+	var requiredMemory int64
+	var f16Multiplier int64 = 2
+
+	switch ggml.ModelType() {
+	case "3B", "7B":
+		requiredMemory = 8 * format.GigaByte
+	case "13B":
+		requiredMemory = 16 * format.GigaByte
+	case "30B", "34B", "40B":
+		requiredMemory = 32 * format.GigaByte
+	case "65B", "70B":
+		requiredMemory = 64 * format.GigaByte
+	case "180B":
+		requiredMemory = 128 * format.GigaByte
+		f16Multiplier = 4
+	}
+
+	systemMemory := int64(memory.TotalMemory())
+
+	if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
+		return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))
+	} else if requiredMemory > systemMemory {
+		return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
 	}
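This rewrite collapses five near-identical case blocks into one data-driven check and, by comparing against format.GigaByte, fixes the old thresholds, which were written as values like 16*1000*1000 and so compared total memory against megabytes rather than gigabytes. A minimal standalone sketch of the new check, with ggml and the memory package replaced by plain parameters and the error formatting simplified (the commit itself prints format.HumanBytes(requiredMemory)):

package main

import "fmt"

const gigaByte int64 = 1000 * 1000 * 1000 // mirrors format.GigaByte

func checkMemory(modelType, fileType string, systemMemory int64) error {
	var requiredMemory int64
	var f16Multiplier int64 = 2 // F16 weights need roughly f16Multiplier times the base requirement

	switch modelType {
	case "3B", "7B":
		requiredMemory = 8 * gigaByte
	case "13B":
		requiredMemory = 16 * gigaByte
	case "30B", "34B", "40B":
		requiredMemory = 32 * gigaByte
	case "65B", "70B":
		requiredMemory = 64 * gigaByte
	case "180B":
		requiredMemory = 128 * gigaByte
		f16Multiplier = 4
	}

	if fileType == "F16" && requiredMemory*f16Multiplier > systemMemory {
		return fmt.Errorf("F16 model requires at least %d GB of total memory", requiredMemory*f16Multiplier/gigaByte)
	} else if requiredMemory > systemMemory {
		return fmt.Errorf("model requires at least %d GB of total memory", requiredMemory/gigaByte)
	}
	return nil
}

func main() {
	// Hypothetical machine with 16 GB of RAM loading a 13B model:
	fmt.Println(checkMemory("13B", "F16", 16*gigaByte))  // error: F16 needs 2 * 16 GB
	fmt.Println(checkMemory("13B", "Q4_0", 16*gigaByte)) // nil: 16 GB meets the base floor
}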