OpenDAS / ollama · Commit 9ef2fce3 (unverified)

Merge pull request #768 from jmorganca/mxyng/bytes

fix memory check

Authored Oct 16, 2023 by Michael Yang; committed via GitHub on Oct 16, 2023.
Parents: 43eaba3d, 11d82d7b

Showing 4 changed files with 50 additions and 50 deletions:

- api/client.go (+2, −1)
- format/bytes.go (+13, −6)
- llm/llama.go (+12, −12)
- llm/llm.go (+23, −31)
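In short: the commit moves byte-size constants (Byte, KiloByte, MegaByte, GigaByte) into the format package and reuses them wherever sizes were previously spelled out as numeric literals. It also fixes two unit problems visible in the diffs below: CheckVRAM now returns bytes rather than MiB, and the system-memory check in llm/llm.go previously compared a byte count against thresholds like 16*1000*1000, which is 16 MB, not 16 GB.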
api/client.go

```diff
@@ -14,6 +14,7 @@ import (
 	"runtime"
 	"strings"
 
+	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/version"
 )
@@ -127,7 +128,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
 	return nil
 }
 
-const maxBufferSize = 512 * 1000 // 512KB
+const maxBufferSize = 512 * format.KiloByte
 
 func (c *Client) stream(ctx context.Context, method, path string, data any, fn func([]byte) error) error {
 	var buf *bytes.Buffer
```
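The diff does not show where maxBufferSize is consumed, but a constant like this is typically passed to bufio.Scanner.Buffer to raise the scanner's default 64 KiB token limit when reading a streamed response line by line. A minimal sketch, assuming that usage:

```go
package main

import (
	"bufio"
	"fmt"
	"strings"
)

// Mirrors the constant in api/client.go after this commit:
// 512 * format.KiloByte == 512 * 1000 bytes (decimal KB).
const maxBufferSize = 512 * 1000

func main() {
	r := strings.NewReader("line one\nline two\n")
	scanner := bufio.NewScanner(r)
	// Raise the token limit from bufio's 64 KiB default so long
	// streamed JSON lines are not rejected with bufio.ErrTooLong.
	scanner.Buffer(make([]byte, 0, maxBufferSize), maxBufferSize)
	for scanner.Scan() {
		fmt.Println(scanner.Text())
	}
}
```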
format/bytes.go

```diff
@@ -2,14 +2,21 @@ package format
 
 import "fmt"
 
+const (
+	Byte     = 1
+	KiloByte = Byte * 1000
+	MegaByte = KiloByte * 1000
+	GigaByte = MegaByte * 1000
+)
+
 func HumanBytes(b int64) string {
 	switch {
-	case b > 1000*1000*1000:
-		return fmt.Sprintf("%d GB", b/1000/1000/1000)
-	case b > 1000*1000:
-		return fmt.Sprintf("%d MB", b/1000/1000)
-	case b > 1000:
-		return fmt.Sprintf("%d KB", b/1000)
+	case b > GigaByte:
+		return fmt.Sprintf("%d GB", b/GigaByte)
+	case b > MegaByte:
+		return fmt.Sprintf("%d MB", b/MegaByte)
+	case b > KiloByte:
+		return fmt.Sprintf("%d KB", b/KiloByte)
 	default:
 		return fmt.Sprintf("%d B", b)
 	}
```
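These constants are decimal (SI) units, 1 KB = 1000 B, which matters later when they are mixed with nvidia-smi's binary MiB. A quick usage sketch of the refactored helper, with hypothetical values:

```go
package main

import (
	"fmt"

	"github.com/jmorganca/ollama/format"
)

func main() {
	// Integer division against decimal units, so results truncate.
	fmt.Println(format.HumanBytes(512))            // 512 B
	fmt.Println(format.HumanBytes(512 * 1000))     // 512 KB
	fmt.Println(format.HumanBytes(13_000_000_000)) // 13 GB
	fmt.Println(format.HumanBytes(2_500_000_000))  // 2 GB (truncated, not rounded)
}
```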
llm/llama.go

```diff
@@ -24,6 +24,7 @@ import (
 	"time"
 
 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/format"
 )
 
 //go:embed llama.cpp/*/build/*/bin/*
@@ -197,7 +198,7 @@ type llama struct {
 var errNoGPU = errors.New("nvidia-smi command failed")
 
-// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
+// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
 	var stdout bytes.Buffer
@@ -207,7 +208,7 @@ func CheckVRAM() (int64, error) {
 		return 0, errNoGPU
 	}
 
-	var free int64
+	var freeMiB int64
 	scanner := bufio.NewScanner(&stdout)
 	for scanner.Scan() {
 		line := scanner.Text()
@@ -216,15 +217,16 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
 		}
 
-		free += vram
+		freeMiB += vram
 	}
 
-	if free*1024*1024 < 2*1000*1000*1000 {
+	freeBytes := freeMiB * 1024 * 1024
+	if freeBytes < 2*format.GigaByte {
 		log.Printf("less than 2 GB VRAM available, falling back to CPU only")
-		free = 0
+		freeMiB = 0
 	}
 
-	return free, nil
+	return freeBytes, nil
 }
 
 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
@@ -232,7 +234,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		return opts.NumGPU
 	}
 	if runtime.GOOS == "linux" {
-		vramMib, err := CheckVRAM()
+		freeBytes, err := CheckVRAM()
 		if err != nil {
 			if err.Error() != "nvidia-smi command failed" {
 				log.Print(err.Error())
@@ -241,15 +243,13 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 			return 0
 		}
 
-		freeVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
-
 		// Calculate bytes per layer
 		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
 		bytesPerLayer := fileSizeBytes / numLayer
 
 		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-		layers := int(freeVramBytes/bytesPerLayer) * 92 / 100
-		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
+		layers := int(freeBytes/bytesPerLayer) * 92 / 100
+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", freeBytes, layers)
 
 		return layers
 	}
@@ -509,7 +509,7 @@ type PredictRequest struct {
 	Stop             []string `json:"stop,omitempty"`
 }
 
-const maxBufferSize = 512 * 1000 // 512KB
+const maxBufferSize = 512 * format.KiloByte
 
 func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
 	prevConvo, err := llm.Decode(ctx, prevContext)
```
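The unit juggling here is easy to get wrong: nvidia-smi reports binary MiB (1024² bytes) while the format constants are decimal. (Note, too, that the log statement now passes freeBytes while its format string still says MiB.) A small worked sketch of the conversion and the 92% layer heuristic, using entirely hypothetical numbers: a 24 GiB card and a 13 GB model with 40 layers:

```go
package main

import "fmt"

func main() {
	// Hypothetical inputs: a 24 GiB GPU, a 13 GB model with 40 layers.
	freeMiB := int64(24 * 1024)        // nvidia-smi reports binary MiB
	freeBytes := freeMiB * 1024 * 1024 // 25,769,803,776 bytes
	fileSizeBytes, numLayer := int64(13_000_000_000), int64(40)

	// Same heuristic as NumGPU: bytes per layer, then keep 8% headroom.
	bytesPerLayer := fileSizeBytes / numLayer // 325,000,000
	layers := int(freeBytes/bytesPerLayer) * 92 / 100

	fmt.Println(layers) // 72: more than numLayer, so every layer fits on GPU
}
```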
llm/llm.go

```diff
@@ -10,6 +10,7 @@ import (
 	"github.com/pbnjay/memory"
 
 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/format"
 )
 
 type LLM interface {
@@ -55,39 +56,30 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
 			opts.NumGPU = 0
 		}
 	}
 
-	totalResidentMemory := memory.TotalMemory()
+	var requiredMemory int64
+	var f16Multiplier int64 = 2
+
 	switch ggml.ModelType() {
 	case "3B", "7B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 16*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 16 GB of memory")
-		} else if totalResidentMemory < 8*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 8 GB of memory")
-		}
+		requiredMemory = 8 * format.GigaByte
 	case "13B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 32*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 32 GB of memory")
-		} else if totalResidentMemory < 16*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 16 GB of memory")
-		}
+		requiredMemory = 16 * format.GigaByte
 	case "30B", "34B", "40B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 64*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 64 GB of memory")
-		} else if totalResidentMemory < 32*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 32 GB of memory")
-		}
+		requiredMemory = 32 * format.GigaByte
 	case "65B", "70B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 128*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 128 GB of memory")
-		} else if totalResidentMemory < 64*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 64 GB of memory")
-		}
+		requiredMemory = 64 * format.GigaByte
 	case "180B":
-		if ggml.FileType() == "F16" && totalResidentMemory < 512*1000*1000 {
-			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
-		} else if totalResidentMemory < 128*1000*1000 {
-			return nil, fmt.Errorf("model requires at least 128GB of memory")
-		}
+		requiredMemory = 128 * format.GigaByte
+		f16Multiplier = 4
 	}
+
+	systemMemory := int64(memory.TotalMemory())
+
+	if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
+		return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))
+	} else if requiredMemory > systemMemory {
+		return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
+	}
```
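This refactor collapses five copies of an if/else ladder into one table plus one comparison, and fixes thresholds that were written as N*1000*1000 (megabytes) yet compared against a byte count. A standalone sketch of the new logic, with a hypothetical input (a 70B F16 model on a 64 GB machine) and gigaByte standing in for format.GigaByte:

```go
package main

import "fmt"

const gigaByte int64 = 1000 * 1000 * 1000 // mirrors format.GigaByte

func main() {
	// Hypothetical inputs: a 70B F16 model, 64 GB of system memory.
	modelType, fileType := "70B", "F16"
	systemMemory := int64(64) * gigaByte

	// Table of quantized minimums; F16 weights need a multiple of that.
	var requiredMemory int64
	var f16Multiplier int64 = 2
	switch modelType {
	case "3B", "7B":
		requiredMemory = 8 * gigaByte
	case "13B":
		requiredMemory = 16 * gigaByte
	case "30B", "34B", "40B":
		requiredMemory = 32 * gigaByte
	case "65B", "70B":
		requiredMemory = 64 * gigaByte
	case "180B":
		requiredMemory = 128 * gigaByte
		f16Multiplier = 4
	}

	if fileType == "F16" && requiredMemory*f16Multiplier > systemMemory {
		fmt.Println("rejected: F16 70B needs 2x64 GB, only 64 GB present")
	} else if requiredMemory > systemMemory {
		fmt.Println("rejected: below the quantized minimum")
	} else {
		fmt.Println("accepted")
	}
}
```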