Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
c77d45d8
Unverified
Commit
c77d45d8
authored
Apr 09, 2024
by
Michael Yang
Committed by
GitHub
Apr 09, 2024
Browse files
Merge pull request #3506 from ollama/mxyng/quantize-redux
cgo quantize
parents
5ec12cec
9502e566
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
145 additions
and
45 deletions
+145
-45
api/client.go
api/client.go
+1
-13
api/types.go
api/types.go
+28
-27
cmd/cmd.go
cmd/cmd.go
+4
-1
llm/llm.go
llm/llm.go
+71
-0
server/images.go
server/images.go
+21
-2
server/routes.go
server/routes.go
+19
-1
server/routes_test.go
server/routes_test.go
+1
-1
No files found.
api/client.go
View file @
c77d45d8
...
...
@@ -5,7 +5,6 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
...
...
@@ -301,18 +300,7 @@ func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*Embedd
}
func
(
c
*
Client
)
CreateBlob
(
ctx
context
.
Context
,
digest
string
,
r
io
.
Reader
)
error
{
if
err
:=
c
.
do
(
ctx
,
http
.
MethodHead
,
fmt
.
Sprintf
(
"/api/blobs/%s"
,
digest
),
nil
,
nil
);
err
!=
nil
{
var
statusError
StatusError
if
!
errors
.
As
(
err
,
&
statusError
)
||
statusError
.
StatusCode
!=
http
.
StatusNotFound
{
return
err
}
if
err
:=
c
.
do
(
ctx
,
http
.
MethodPost
,
fmt
.
Sprintf
(
"/api/blobs/%s"
,
digest
),
r
,
nil
);
err
!=
nil
{
return
err
}
}
return
nil
return
c
.
do
(
ctx
,
http
.
MethodPost
,
fmt
.
Sprintf
(
"/api/blobs/%s"
,
digest
),
r
,
nil
)
}
func
(
c
*
Client
)
Version
(
ctx
context
.
Context
)
(
string
,
error
)
{
...
...
api/types.go
View file @
c77d45d8
...
...
@@ -109,19 +109,19 @@ type Options struct {
// Runner options which must be set when the model is loaded into memory
type
Runner
struct
{
UseNUMA
bool
`json:"numa,omitempty"`
NumCtx
int
`json:"num_ctx,omitempty"`
NumBatch
int
`json:"num_batch,omitempty"`
NumGQA
int
`json:"num_gqa,omitempty"`
NumGPU
int
`json:"num_gpu,omitempty"`
MainGPU
int
`json:"main_gpu,omitempty"`
LowVRAM
bool
`json:"low_vram,omitempty"`
F16KV
bool
`json:"f16_kv,omitempty"`
LogitsAll
bool
`json:"logits_all,omitempty"`
VocabOnly
bool
`json:"vocab_only,omitempty"`
UseMMap
bool
`json:"use_mmap,omitempty"`
UseMLock
bool
`json:"use_mlock,omitempty"`
NumThread
int
`json:"num_thread,omitempty"`
UseNUMA
bool
`json:"numa,omitempty"`
NumCtx
int
`json:"num_ctx,omitempty"`
NumBatch
int
`json:"num_batch,omitempty"`
NumGQA
int
`json:"num_gqa,omitempty"`
NumGPU
int
`json:"num_gpu,omitempty"`
MainGPU
int
`json:"main_gpu,omitempty"`
LowVRAM
bool
`json:"low_vram,omitempty"`
F16KV
bool
`json:"f16_kv,omitempty"`
LogitsAll
bool
`json:"logits_all,omitempty"`
VocabOnly
bool
`json:"vocab_only,omitempty"`
UseMMap
bool
`json:"use_mmap,omitempty"`
UseMLock
bool
`json:"use_mlock,omitempty"`
NumThread
int
`json:"num_thread,omitempty"`
}
type
EmbeddingRequest
struct
{
...
...
@@ -137,10 +137,11 @@ type EmbeddingResponse struct {
}
type
CreateRequest
struct
{
Model
string
`json:"model"`
Path
string
`json:"path"`
Modelfile
string
`json:"modelfile"`
Stream
*
bool
`json:"stream,omitempty"`
Model
string
`json:"model"`
Path
string
`json:"path"`
Modelfile
string
`json:"modelfile"`
Stream
*
bool
`json:"stream,omitempty"`
Quantization
string
`json:"quantization,omitempty"`
// Name is deprecated, see Model
Name
string
`json:"name"`
...
...
@@ -380,16 +381,16 @@ func DefaultOptions() Options {
Runner
:
Runner
{
// options set when the model is loaded
NumCtx
:
2048
,
NumBatch
:
512
,
NumGPU
:
-
1
,
// -1 here indicates that NumGPU should be set dynamically
NumGQA
:
1
,
NumThread
:
0
,
// let the runtime decide
LowVRAM
:
false
,
F16KV
:
true
,
UseMLock
:
false
,
UseMMap
:
true
,
UseNUMA
:
false
,
NumCtx
:
2048
,
NumBatch
:
512
,
NumGPU
:
-
1
,
// -1 here indicates that NumGPU should be set dynamically
NumGQA
:
1
,
NumThread
:
0
,
// let the runtime decide
LowVRAM
:
false
,
F16KV
:
true
,
UseMLock
:
false
,
UseMMap
:
true
,
UseNUMA
:
false
,
},
}
}
...
...
cmd/cmd.go
View file @
c77d45d8
...
...
@@ -194,7 +194,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
return
nil
}
request
:=
api
.
CreateRequest
{
Name
:
args
[
0
],
Modelfile
:
string
(
modelfile
)}
quantization
,
_
:=
cmd
.
Flags
()
.
GetString
(
"quantization"
)
request
:=
api
.
CreateRequest
{
Name
:
args
[
0
],
Modelfile
:
string
(
modelfile
),
Quantization
:
quantization
}
if
err
:=
client
.
Create
(
cmd
.
Context
(),
&
request
,
fn
);
err
!=
nil
{
return
err
}
...
...
@@ -943,6 +945,7 @@ func NewCLI() *cobra.Command {
}
createCmd
.
Flags
()
.
StringP
(
"file"
,
"f"
,
"Modelfile"
,
"Name of the Modelfile (default
\"
Modelfile
\"
)"
)
createCmd
.
Flags
()
.
StringP
(
"quantization"
,
"q"
,
""
,
"Quantization level."
)
showCmd
:=
&
cobra
.
Command
{
Use
:
"show MODEL"
,
...
...
llm/llm.go
View file @
c77d45d8
...
...
@@ -6,10 +6,81 @@ package llm
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
// #include <stdlib.h>
// #include "llama.h"
import
"C"
import
(
"fmt"
"unsafe"
)
// SystemInfo is an unused example of calling llama.cpp functions using CGo
func
SystemInfo
()
string
{
return
C
.
GoString
(
C
.
llama_print_system_info
())
}
func
Quantize
(
infile
,
outfile
,
filetype
string
)
error
{
cinfile
:=
C
.
CString
(
infile
)
defer
C
.
free
(
unsafe
.
Pointer
(
cinfile
))
coutfile
:=
C
.
CString
(
outfile
)
defer
C
.
free
(
unsafe
.
Pointer
(
coutfile
))
params
:=
C
.
llama_model_quantize_default_params
()
params
.
nthread
=
-
1
switch
filetype
{
case
"F32"
:
params
.
ftype
=
fileTypeF32
case
"F16"
:
params
.
ftype
=
fileTypeF16
case
"Q4_0"
:
params
.
ftype
=
fileTypeQ4_0
case
"Q4_1"
:
params
.
ftype
=
fileTypeQ4_1
case
"Q4_1_F16"
:
params
.
ftype
=
fileTypeQ4_1_F16
case
"Q8_0"
:
params
.
ftype
=
fileTypeQ8_0
case
"Q5_0"
:
params
.
ftype
=
fileTypeQ5_0
case
"Q5_1"
:
params
.
ftype
=
fileTypeQ5_1
case
"Q2_K"
:
params
.
ftype
=
fileTypeQ2_K
case
"Q3_K_S"
:
params
.
ftype
=
fileTypeQ3_K_S
case
"Q3_K_M"
:
params
.
ftype
=
fileTypeQ3_K_M
case
"Q3_K_L"
:
params
.
ftype
=
fileTypeQ3_K_L
case
"Q4_K_S"
:
params
.
ftype
=
fileTypeQ4_K_S
case
"Q4_K_M"
:
params
.
ftype
=
fileTypeQ4_K_M
case
"Q5_K_S"
:
params
.
ftype
=
fileTypeQ5_K_S
case
"Q5_K_M"
:
params
.
ftype
=
fileTypeQ5_K_M
case
"Q6_K"
:
params
.
ftype
=
fileTypeQ6_K
case
"IQ2_XXS"
:
params
.
ftype
=
fileTypeIQ2_XXS
case
"IQ2_XS"
:
params
.
ftype
=
fileTypeIQ2_XS
case
"Q2_K_S"
:
params
.
ftype
=
fileTypeQ2_K_S
case
"Q3_K_XS"
:
params
.
ftype
=
fileTypeQ3_K_XS
case
"IQ3_XXS"
:
params
.
ftype
=
fileTypeIQ3_XXS
default
:
return
fmt
.
Errorf
(
"unknown filetype: %s"
,
filetype
)
}
if
retval
:=
C
.
llama_model_quantize
(
cinfile
,
coutfile
,
&
params
);
retval
!=
0
{
return
fmt
.
Errorf
(
"llama_model_quantize: %d"
,
retval
)
}
return
nil
}
server/images.go
View file @
c77d45d8
...
...
@@ -284,7 +284,7 @@ func realpath(mfDir, from string) string {
return
abspath
}
func
CreateModel
(
ctx
context
.
Context
,
name
,
modelFileDir
string
,
commands
[]
parser
.
Command
,
fn
func
(
resp
api
.
ProgressResponse
))
error
{
func
CreateModel
(
ctx
context
.
Context
,
name
,
modelFileDir
,
quantization
string
,
commands
[]
parser
.
Command
,
fn
func
(
resp
api
.
ProgressResponse
))
error
{
deleteMap
:=
make
(
map
[
string
]
struct
{})
if
manifest
,
_
,
err
:=
GetManifest
(
ParseModelPath
(
name
));
err
==
nil
{
for
_
,
layer
:=
range
append
(
manifest
.
Layers
,
manifest
.
Config
)
{
...
...
@@ -337,8 +337,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
if
ggufName
!=
""
{
pathName
=
ggufName
slog
.
Debug
(
fmt
.
Sprintf
(
"new image layer path: %s"
,
pathName
))
defer
os
.
RemoveAll
(
ggufName
)
if
quantization
!=
""
{
quantization
=
strings
.
ToUpper
(
quantization
)
fn
(
api
.
ProgressResponse
{
Status
:
fmt
.
Sprintf
(
"quantizing %s model to %s"
,
"F16"
,
quantization
)})
tempfile
,
err
:=
os
.
CreateTemp
(
filepath
.
Dir
(
ggufName
),
quantization
)
if
err
!=
nil
{
return
err
}
defer
os
.
RemoveAll
(
tempfile
.
Name
())
if
err
:=
llm
.
Quantize
(
ggufName
,
tempfile
.
Name
(),
quantization
);
err
!=
nil
{
return
err
}
if
err
:=
tempfile
.
Close
();
err
!=
nil
{
return
err
}
pathName
=
tempfile
.
Name
()
}
}
bin
,
err
:=
os
.
Open
(
pathName
)
...
...
server/routes.go
View file @
c77d45d8
...
...
@@ -647,7 +647,7 @@ func CreateModelHandler(c *gin.Context) {
ctx
,
cancel
:=
context
.
WithCancel
(
c
.
Request
.
Context
())
defer
cancel
()
if
err
:=
CreateModel
(
ctx
,
model
,
filepath
.
Dir
(
req
.
Path
),
commands
,
fn
);
err
!=
nil
{
if
err
:=
CreateModel
(
ctx
,
model
,
filepath
.
Dir
(
req
.
Path
),
req
.
Quantization
,
commands
,
fn
);
err
!=
nil
{
ch
<-
gin
.
H
{
"error"
:
err
.
Error
()}
}
}()
...
...
@@ -913,6 +913,24 @@ func HeadBlobHandler(c *gin.Context) {
}
func
CreateBlobHandler
(
c
*
gin
.
Context
)
{
path
,
err
:=
GetBlobsPath
(
c
.
Param
(
"digest"
))
if
err
!=
nil
{
c
.
AbortWithStatusJSON
(
http
.
StatusBadRequest
,
gin
.
H
{
"error"
:
err
.
Error
()})
return
}
_
,
err
=
os
.
Stat
(
path
)
switch
{
case
errors
.
Is
(
err
,
os
.
ErrNotExist
)
:
// noop
case
err
!=
nil
:
c
.
AbortWithStatusJSON
(
http
.
StatusInternalServerError
,
gin
.
H
{
"error"
:
err
.
Error
()})
return
default
:
c
.
Status
(
http
.
StatusOK
)
return
}
layer
,
err
:=
NewLayer
(
c
.
Request
.
Body
,
""
)
if
err
!=
nil
{
c
.
AbortWithStatusJSON
(
http
.
StatusInternalServerError
,
gin
.
H
{
"error"
:
err
.
Error
()})
...
...
server/routes_test.go
View file @
c77d45d8
...
...
@@ -61,7 +61,7 @@ func Test_Routes(t *testing.T) {
fn
:=
func
(
resp
api
.
ProgressResponse
)
{
t
.
Logf
(
"Status: %s"
,
resp
.
Status
)
}
err
=
CreateModel
(
context
.
TODO
(),
name
,
""
,
commands
,
fn
)
err
=
CreateModel
(
context
.
TODO
(),
name
,
""
,
""
,
commands
,
fn
)
assert
.
Nil
(
t
,
err
)
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment