Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
orangecat
ollama
Commits
9ef2fce3
"vscode:/vscode.git/clone" did not exist on "f5eec0d8e96ed55e5d12d7c63213667284f0e12d"
Unverified
Commit
9ef2fce3
authored
Oct 16, 2023
by
Michael Yang
Committed by
GitHub
Oct 16, 2023
Browse files
Merge pull request #768 from jmorganca/mxyng/bytes
fix memory check
parents
43eaba3d
11d82d7b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
50 additions
and
50 deletions
+50
-50
api/client.go
api/client.go
+2
-1
format/bytes.go
format/bytes.go
+13
-6
llm/llama.go
llm/llama.go
+12
-12
llm/llm.go
llm/llm.go
+23
-31
No files found.
api/client.go
View file @
9ef2fce3
...
...
@@ -14,6 +14,7 @@ import (
"runtime"
"strings"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/version"
)
...
...
@@ -127,7 +128,7 @@ func (c *Client) do(ctx context.Context, method, path string, reqData, respData
return
nil
}
const
maxBufferSize
=
512
*
1000
// 512KB
const
maxBufferSize
=
512
*
format
.
KiloByte
func
(
c
*
Client
)
stream
(
ctx
context
.
Context
,
method
,
path
string
,
data
any
,
fn
func
([]
byte
)
error
)
error
{
var
buf
*
bytes
.
Buffer
...
...
format/bytes.go
View file @
9ef2fce3
...
...
@@ -2,14 +2,21 @@ package format
import
"fmt"
// Quantities of bytes, in decimal (SI) units.
const (
	Byte     = 1
	KiloByte = Byte * 1000
	MegaByte = KiloByte * 1000
	GigaByte = MegaByte * 1000
)

// HumanBytes formats b as a human-readable decimal string such as
// "1 KB" or "3 GB", truncating (integer division) rather than
// rounding. Values at or below a unit boundary fall through to the
// smaller unit, so HumanBytes(1000) is "1000 B", not "1 KB".
func HumanBytes(b int64) string {
	switch {
	case b > GigaByte:
		return fmt.Sprintf("%d GB", b/GigaByte)
	case b > MegaByte:
		return fmt.Sprintf("%d MB", b/MegaByte)
	case b > KiloByte:
		return fmt.Sprintf("%d KB", b/KiloByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}
...
...
llm/llama.go
View file @
9ef2fce3
...
...
@@ -24,6 +24,7 @@ import (
"time"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
)
//go:embed llama.cpp/*/build/*/bin/*
...
...
@@ -197,7 +198,7 @@ type llama struct {
// errNoGPU is returned by CheckVRAM when nvidia-smi is missing or
// exits with an error, i.e. no usable NVIDIA GPU was detected.
var errNoGPU = errors.New("nvidia-smi command failed")
// CheckVRAM returns the
availabl
e VRAM in
MiB
on Linux machines with NVIDIA GPUs
// CheckVRAM returns the
fre
e VRAM in
bytes
on Linux machines with NVIDIA GPUs
func
CheckVRAM
()
(
int64
,
error
)
{
cmd
:=
exec
.
Command
(
"nvidia-smi"
,
"--query-gpu=memory.free"
,
"--format=csv,noheader,nounits"
)
var
stdout
bytes
.
Buffer
...
...
@@ -207,7 +208,7 @@ func CheckVRAM() (int64, error) {
return
0
,
errNoGPU
}
var
free
int64
var
free
MiB
int64
scanner
:=
bufio
.
NewScanner
(
&
stdout
)
for
scanner
.
Scan
()
{
line
:=
scanner
.
Text
()
...
...
@@ -216,15 +217,16 @@ func CheckVRAM() (int64, error) {
return
0
,
fmt
.
Errorf
(
"failed to parse available VRAM: %v"
,
err
)
}
free
+=
vram
free
MiB
+=
vram
}
if
free
*
1024
*
1024
<
2
*
1000
*
1000
*
1000
{
freeBytes
:=
freeMiB
*
1024
*
1024
if
freeBytes
<
2
*
format
.
GigaByte
{
log
.
Printf
(
"less than 2 GB VRAM available, falling back to CPU only"
)
free
=
0
free
MiB
=
0
}
return
free
,
nil
return
free
Bytes
,
nil
}
func
NumGPU
(
numLayer
,
fileSizeBytes
int64
,
opts
api
.
Options
)
int
{
...
...
@@ -232,7 +234,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
return
opts
.
NumGPU
}
if
runtime
.
GOOS
==
"linux"
{
vramMib
,
err
:=
CheckVRAM
()
freeBytes
,
err
:=
CheckVRAM
()
if
err
!=
nil
{
if
err
.
Error
()
!=
"nvidia-smi command failed"
{
log
.
Print
(
err
.
Error
())
...
...
@@ -241,15 +243,13 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
return
0
}
freeVramBytes
:=
int64
(
vramMib
)
*
1024
*
1024
// 1 MiB = 1024^2 bytes
// Calculate bytes per layer
// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
bytesPerLayer
:=
fileSizeBytes
/
numLayer
// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
layers
:=
int
(
free
Vram
Bytes
/
bytesPerLayer
)
*
92
/
100
log
.
Printf
(
"%d MiB VRAM available, loading up to %d GPU layers"
,
vramMib
,
layers
)
layers
:=
int
(
freeBytes
/
bytesPerLayer
)
*
92
/
100
log
.
Printf
(
"%d MiB VRAM available, loading up to %d GPU layers"
,
freeBytes
,
layers
)
return
layers
}
...
...
@@ -509,7 +509,7 @@ type PredictRequest struct {
Stop
[]
string
`json:"stop,omitempty"`
}
const
maxBufferSize
=
512
*
1000
// 512KB
const
maxBufferSize
=
512
*
format
.
KiloByte
func
(
llm
*
llama
)
Predict
(
ctx
context
.
Context
,
prevContext
[]
int
,
prompt
string
,
fn
func
(
api
.
GenerateResponse
))
error
{
prevConvo
,
err
:=
llm
.
Decode
(
ctx
,
prevContext
)
...
...
llm/llm.go
View file @
9ef2fce3
...
...
@@ -10,6 +10,7 @@ import (
"github.com/pbnjay/memory"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
)
type
LLM
interface
{
...
...
@@ -55,39 +56,30 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
opts
.
NumGPU
=
0
}
}
}
totalResidentMemory
:=
memory
.
TotalMemory
()
var
requiredMemory
int64
var
f16Multiplier
int64
=
2
switch
ggml
.
ModelType
()
{
case
"3B"
,
"7B"
:
if
ggml
.
FileType
()
==
"F16"
&&
totalResidentMemory
<
16
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least 16 GB of memory"
)
}
else
if
totalResidentMemory
<
8
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"model requires at least 8 GB of memory"
)
}
requiredMemory
=
8
*
format
.
GigaByte
case
"13B"
:
if
ggml
.
FileType
()
==
"F16"
&&
totalResidentMemory
<
32
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least 32 GB of memory"
)
}
else
if
totalResidentMemory
<
16
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"model requires at least 16 GB of memory"
)
}
requiredMemory
=
16
*
format
.
GigaByte
case
"30B"
,
"34B"
,
"40B"
:
if
ggml
.
FileType
()
==
"F16"
&&
totalResidentMemory
<
64
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least 64 GB of memory"
)
}
else
if
totalResidentMemory
<
32
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"model requires at least 32 GB of memory"
)
}
requiredMemory
=
32
*
format
.
GigaByte
case
"65B"
,
"70B"
:
if
ggml
.
FileType
()
==
"F16"
&&
totalResidentMemory
<
128
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least 128 GB of memory"
)
}
else
if
totalResidentMemory
<
64
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"model requires at least 64 GB of memory"
)
}
requiredMemory
=
64
*
format
.
GigaByte
case
"180B"
:
if
ggml
.
FileType
()
==
"F16"
&&
totalResidentMemory
<
512
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least 512GB of memory"
)
}
else
if
totalResidentMemory
<
128
*
1000
*
1000
{
return
nil
,
fmt
.
Errorf
(
"model requires at least 128GB of memory"
)
requiredMemory
=
128
*
format
.
GigaByte
f16Multiplier
=
4
}
systemMemory
:=
int64
(
memory
.
TotalMemory
())
if
ggml
.
FileType
()
==
"F16"
&&
requiredMemory
*
f16Multiplier
>
systemMemory
{
return
nil
,
fmt
.
Errorf
(
"F16 model requires at least %s of total memory"
,
format
.
HumanBytes
(
requiredMemory
))
}
else
if
requiredMemory
>
systemMemory
{
return
nil
,
fmt
.
Errorf
(
"model requires at least %s of total memory"
,
format
.
HumanBytes
(
requiredMemory
))
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment