OpenDAS / ollama · Commit 4c4c730a

Unverified commit 4c4c730a, authored Jan 27, 2024 by mraiser, committed by GitHub on Jan 27, 2024.

Merge branch 'ollama:main' into main

Parents: 6eb3cddc, e02ecfb6
Showing 20 of 25 changed files with 535 additions and 142 deletions.

.github/workflows/test.yaml   +77  -21
Dockerfile                    +13  -2
api/types.go                  +27  -17
cmd/cmd.go                    +11  -9
cmd/interactive.go            +127 -24
cmd/interactive_test.go       +65  -0
docs/development.md           +2   -1
docs/modelfile.md             +15  -0
gpu/gpu.go                    +37  -7
gpu/gpu_info.h                +1   -0
gpu/gpu_info_cuda.c           +1   -0
gpu/gpu_info_rocm.c           +11  -4
llm/dyn_ext_server.go         +1   -0
llm/generate/gen_common.sh    +14  -0
llm/generate/gen_linux.sh     +1   -1
llm/generate/gen_windows.ps1  +29  -1
llm/gguf.go                   +61  -54
llm/llama.cpp                 +1   -1
llm/patches/01-cache.diff     +30  -0
parser/parser.go              +11  -0
.github/workflows/test.yaml

@@ -23,29 +23,72 @@ jobs:
       with:
         go-version: '1.21'
         cache: true
+      - if: ${{ startsWith(matrix.os, 'windows-') }}
+        shell: pwsh
+        run: |
+          $path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
+          if ($path) {
+            $path = join-path $path 'Common7\Tools\vsdevcmd.bat'
+            if (test-path $path) {
+              cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach {
+                echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append
+              }
+            }
+          }
+          echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
       - run: go get ./...
       - run: go generate -x ./...
       - uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
-            llm/llama.cpp/build/**/lib/*
+          path: llm/llama.cpp/build/**/lib/*
+  generate-cuda:
+    strategy:
+      matrix:
+        cuda-version:
+          - '11.8.0'
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04
+    steps:
+      - run: |
+          apt-get update && apt-get install -y git build-essential curl
+          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
+            | tar -zx -C /usr --strip-components 1
+        env:
+          DEBIAN_FRONTEND: noninteractive
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+          cache: true
+      - run: go get ./...
+      - run: |
+          git config --global --add safe.directory /__w/ollama/ollama
+          go generate -x ./...
+        env:
+          OLLAMA_SKIP_CPU_GENERATE: '1'
+      - uses: actions/upload-artifact@v4
+        with:
+          name: cuda-${{ matrix.cuda-version }}-libraries
+          path: llm/llama.cpp/build/**/lib/*
+  generate-rocm:
+    strategy:
+      matrix:
+        rocm-version:
+          - '5.7.1'
+          - '6.0'
+    runs-on: ubuntu-latest
+    container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}
+    steps:
+      - run: |
+          apt-get update && apt-get install -y git build-essential curl rocm-libs
+          curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \
+            | tar -zx -C /usr --strip-components 1
+        env:
+          DEBIAN_FRONTEND: noninteractive
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '1.21'
+          cache: true
+      - run: go get ./...
+      - run: |
+          git config --global --add safe.directory /__w/ollama/ollama
+          go generate -x ./...
+        env:
+          OLLAMA_SKIP_CPU_GENERATE: '1'
+      - uses: actions/upload-artifact@v4
+        with:
+          name: rocm-${{ matrix.rocm-version }}-libraries
+          path: llm/llama.cpp/build/**/lib/*
   lint:
+    needs: generate
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]

@@ -69,10 +112,19 @@ jobs:
       with:
         go-version: '1.21'
        cache: false
-      - uses: actions/download-artifact@v4
-        with:
-          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: llm/llama.cpp/build
+      - run: |
+          mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so
+        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
+      - run: |
+          mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib
+          touch llm/llama.cpp/ggml-metal.metal
+        if: ${{ startsWith(matrix.os, 'macos-') }}
+      - run: |
+          mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/
+          touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll
+        if: ${{ startsWith(matrix.os, 'windows-') }}
       - uses: golangci/golangci-lint-action@v3
   test:
     needs: generate

@@ -104,3 +156,7 @@ jobs:
           path: llm/llama.cpp/build
       - run: go build
       - run: go test -v ./...
+      - uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.os }}-binaries
+          path: ollama
Dockerfile

@@ -109,17 +109,28 @@ ARG CGO_CFLAGS
 RUN go build .

 # Runtime stages
-FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete as runtime-amd64
+FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
+RUN apt-get update && apt-get install -y ca-certificates
 COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
 RUN apt-get update && apt-get install -y ca-certificates
 COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama

+# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
+FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm
+RUN update-pciids
+COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama
+EXPOSE 11434
+ENV OLLAMA_HOST 0.0.0.0
+ENTRYPOINT ["/bin/ollama"]
+CMD ["serve"]
+
 FROM runtime-$TARGETARCH
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/rocm/lib:
+ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENTRYPOINT ["/bin/ollama"]
api/types.go

@@ -34,24 +34,26 @@ func (e StatusError) Error() string {
 type ImageData []byte

 type GenerateRequest struct {
 	Model    string      `json:"model"`
 	Prompt   string      `json:"prompt"`
 	System   string      `json:"system"`
 	Template string      `json:"template"`
 	Context  []int       `json:"context,omitempty"`
 	Stream   *bool       `json:"stream,omitempty"`
 	Raw      bool        `json:"raw,omitempty"`
 	Format   string      `json:"format"`
+	KeepAlive *Duration  `json:"keep_alive,omitempty"`
 	Images   []ImageData `json:"images,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }

 type ChatRequest struct {
 	Model    string    `json:"model"`
 	Messages []Message `json:"messages"`
 	Stream   *bool     `json:"stream,omitempty"`
 	Format   string    `json:"format"`
+	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }

@@ -126,8 +128,9 @@ type Runner struct {
 }

 type EmbeddingRequest struct {
 	Model  string `json:"model"`
 	Prompt string `json:"prompt"`
+	KeepAlive *Duration `json:"keep_alive,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }

@@ -171,6 +174,7 @@ type ShowResponse struct {
 	Template string       `json:"template,omitempty"`
 	System   string       `json:"system,omitempty"`
 	Details  ModelDetails `json:"details,omitempty"`
+	Messages []Message    `json:"messages,omitempty"`
 }

 type CopyRequest struct {

@@ -236,6 +240,7 @@ type GenerateResponse struct {
 }

 type ModelDetails struct {
+	ParentModel string   `json:"parent_model"`
 	Format      string   `json:"format"`
 	Family      string   `json:"family"`
 	Families    []string `json:"families"`

@@ -411,14 +416,19 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) {
 	case float64:
 		if t < 0 {
 			t = math.MaxFloat64
+			d.Duration = time.Duration(t)
+		} else {
+			d.Duration = time.Duration(t * float64(time.Second))
 		}
-		d.Duration = time.Duration(t)
 	case string:
 		d.Duration, err = time.ParseDuration(t)
 		if err != nil {
 			return err
 		}
+		if d.Duration < 0 {
+			mf := math.MaxFloat64
+			d.Duration = time.Duration(mf)
+		}
 	}

 	return nil
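For context: with these changes a request may carry keep_alive either as a number of seconds or as a Go duration string, and negative values pin the model in memory. A minimal standalone sketch of the same decoding logic; the Duration wrapper here is a simplified stand-in for the one in api/types.go, not the real package:

package main

import (
	"encoding/json"
	"fmt"
	"math"
	"time"
)

type Duration struct {
	time.Duration
}

// UnmarshalJSON mirrors the updated api.Duration logic in this diff:
// numbers are interpreted as seconds, strings go through
// time.ParseDuration, and negative values are clamped to a huge
// duration (via math.MaxFloat64) meaning "keep loaded indefinitely".
func (d *Duration) UnmarshalJSON(b []byte) (err error) {
	var v interface{}
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}
	switch t := v.(type) {
	case float64:
		if t < 0 {
			t = math.MaxFloat64
			d.Duration = time.Duration(t)
		} else {
			d.Duration = time.Duration(t * float64(time.Second))
		}
	case string:
		d.Duration, err = time.ParseDuration(t)
		if err != nil {
			return err
		}
		if d.Duration < 0 {
			mf := math.MaxFloat64
			d.Duration = time.Duration(mf)
		}
	}
	return nil
}

func main() {
	// 300 decodes as 5m0s, "10m" as 10m0s, -1 as the clamped huge value.
	for _, raw := range []string{`300`, `"10m"`, `-1`} {
		var d Duration
		if err := json.Unmarshal([]byte(raw), &d); err != nil {
			panic(err)
		}
		fmt.Printf("%-6s -> %v\n", raw, d.Duration)
	}
}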
cmd/cmd.go

@@ -458,15 +458,17 @@ func RunGenerate(cmd *cobra.Command, args []string) error {
 type generateContextKey string

 type runOptions struct {
-	Model    string
-	Prompt   string
-	Messages []api.Message
-	WordWrap bool
-	Format   string
-	System   string
-	Template string
-	Images   []api.ImageData
-	Options  map[string]interface{}
+	Model       string
+	ParentModel string
+	Prompt      string
+	Messages    []api.Message
+	WordWrap    bool
+	Format      string
+	System      string
+	Template    string
+	Images      []api.ImageData
+	Options     map[string]interface{}
+	MultiModal  bool
 }

 type displayResponseState struct {
cmd/interactive.go

@@ -7,12 +7,14 @@ import (
 	"net/http"
 	"os"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/spf13/cobra"
 	"golang.org/x/exp/slices"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/progress"
 	"github.com/jmorganca/ollama/readline"
 )

@@ -25,43 +27,75 @@ const (
 	MultilineTemplate
 )

-func modelIsMultiModal(cmd *cobra.Command, name string) bool {
-	// get model details
+func loadModel(cmd *cobra.Command, opts *runOptions) error {
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
-		fmt.Println("error: couldn't connect to ollama server")
-		return false
+		return err
 	}

-	req := api.ShowRequest{Name: name}
-	resp, err := client.Show(cmd.Context(), &req)
+	p := progress.NewProgress(os.Stderr)
+	defer p.StopAndClear()
+
+	spinner := progress.NewSpinner("")
+	p.Add("", spinner)
+
+	showReq := api.ShowRequest{Name: opts.Model}
+	showResp, err := client.Show(cmd.Context(), &showReq)
 	if err != nil {
-		return false
+		return err
 	}
+	opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
+	opts.ParentModel = showResp.Details.ParentModel

-	return slices.Contains(resp.Details.Families, "clip")
-}
+	if len(showResp.Messages) > 0 {
+		opts.Messages = append(opts.Messages, showResp.Messages...)
+	}

-func generateInteractive(cmd *cobra.Command, opts runOptions) error {
-	multiModal := modelIsMultiModal(cmd, opts.Model)
-
 	// load the model
-	loadOpts := runOptions{
+	chatReq := &api.ChatRequest{
 		Model:    opts.Model,
-		Prompt:   "",
 		Messages: []api.Message{},
 	}
-	if _, err := chat(cmd, loadOpts); err != nil {
+	err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
+		p.StopAndClear()
+		if len(opts.Messages) > 0 {
+			for _, msg := range opts.Messages {
+				switch msg.Role {
+				case "user":
+					fmt.Printf(">>> %s\n", msg.Content)
+				case "assistant":
+					state := &displayResponseState{}
+					displayResponse(msg.Content, opts.WordWrap, state)
+					fmt.Println()
+					fmt.Println()
+				}
+			}
+		}
+		return nil
+	})
+	if err != nil {
 		return err
 	}
+
+	return nil
+}
+
+func generateInteractive(cmd *cobra.Command, opts runOptions) error {
+	opts.Messages = make([]api.Message, 0)
+
+	err := loadModel(cmd, &opts)
+	if err != nil {
+		return err
+	}

 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
 		fmt.Fprintln(os.Stderr, "  /set            Set session variables")
 		fmt.Fprintln(os.Stderr, "  /show           Show model information")
+		fmt.Fprintln(os.Stderr, "  /load <model>   Load a session or model")
+		fmt.Fprintln(os.Stderr, "  /save <model>   Save your current session")
 		fmt.Fprintln(os.Stderr, "  /bye            Exit")
 		fmt.Fprintln(os.Stderr, "  /?, /help       Help for a command")
 		fmt.Fprintln(os.Stderr, "  /? shortcuts    Help for keyboard shortcuts")
 		fmt.Fprintln(os.Stderr, "")
 		fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.")
 		fmt.Fprintln(os.Stderr, "")

@@ -140,7 +174,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	var sb strings.Builder
 	var multiline MultilineState
-	opts.Messages = make([]api.Message, 0)

 	for {
 		line, err := scanner.Readline()

@@ -203,6 +236,44 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			if err := ListHandler(cmd, args[1:]); err != nil {
 				return err
 			}
+		case strings.HasPrefix(line, "/load"):
+			args := strings.Fields(line)
+			if len(args) != 2 {
+				fmt.Println("Usage:\n  /load <modelname>")
+				continue
+			}
+			opts.Model = args[1]
+			opts.Messages = []api.Message{}
+			fmt.Printf("Loading model '%s'\n", opts.Model)
+			if err := loadModel(cmd, &opts); err != nil {
+				return err
+			}
+			continue
+		case strings.HasPrefix(line, "/save"):
+			args := strings.Fields(line)
+			if len(args) != 2 {
+				fmt.Println("Usage:\n  /save <modelname>")
+				continue
+			}
+			client, err := api.ClientFromEnvironment()
+			if err != nil {
+				fmt.Println("error: couldn't connect to ollama server")
+				return err
+			}
+			req := &api.CreateRequest{
+				Name:      args[1],
+				Modelfile: buildModelfile(opts),
+			}
+			fn := func(resp api.ProgressResponse) error { return nil }
+			err = client.Create(cmd.Context(), req, fn)
+			if err != nil {
+				fmt.Println("error: couldn't save model")
+				return err
+			}
+			fmt.Printf("Created new model '%s'\n", args[1])
+			continue
 		case strings.HasPrefix(line, "/set"):
 			args := strings.Fields(line)
 			if len(args) > 1 {

@@ -389,7 +460,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			args := strings.Fields(line)
 			isFile := false

-			if multiModal {
+			if opts.MultiModal {
 				for _, f := range extractFileNames(line) {
 					if strings.HasPrefix(f, args[0]) {
 						isFile = true

@@ -411,7 +482,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		if sb.Len() > 0 && multiline == MultilineNone {
 			newMessage := api.Message{Role: "user", Content: sb.String()}

-			if multiModal {
+			if opts.MultiModal {
 				msg, images, err := extractFileData(sb.String())
 				if err != nil {
 					return err

@@ -454,6 +525,38 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	}
 }

+func buildModelfile(opts runOptions) string {
+	var mf strings.Builder
+	model := opts.ParentModel
+	if model == "" {
+		model = opts.Model
+	}
+	fmt.Fprintf(&mf, "FROM %s\n", model)
+	if opts.System != "" {
+		fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
+	}
+
+	if opts.Template != "" {
+		fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template)
+	}
+
+	keys := make([]string, 0)
+	for k := range opts.Options {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
+	}
+	fmt.Fprintln(&mf)
+
+	for _, msg := range opts.Messages {
+		fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
+	}
+
+	return mf.String()
+}
+
 func normalizeFilePath(fp string) string {
 	// Define a map of escaped characters and their replacements
 	replacements := map[string]string{
cmd/interactive_test.go

 package cmd

 import (
+	"bytes"
 	"testing"
+	"text/template"

 	"github.com/stretchr/testify/assert"
+
+	"github.com/jmorganca/ollama/api"
 )

 func TestExtractFilenames(t *testing.T) {

@@ -49,3 +53,64 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
 	assert.Contains(t, res[9], "ten.svg")
 	assert.Contains(t, res[9], "E:")
 }
+
+func TestModelfileBuilder(t *testing.T) {
+	opts := runOptions{
+		Model:    "hork",
+		System:   "You are part horse and part shark, but all hork. Do horklike things",
+		Template: "This is a template.",
+		Messages: []api.Message{
+			{Role: "user", Content: "Hey there hork!"},
+			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
+		},
+		Options: map[string]interface{}{},
+	}
+
+	opts.Options["temperature"] = 0.9
+	opts.Options["seed"] = 42
+	opts.Options["penalize_newline"] = false
+	opts.Options["stop"] = []string{"hi", "there"}
+
+	mf := buildModelfile(opts)
+	expectedModelfile := `FROM {{.Model}}
+SYSTEM """{{.System}}"""
+TEMPLATE """{{.Template}}"""
+PARAMETER penalize_newline false
+PARAMETER seed 42
+PARAMETER stop [hi there]
+PARAMETER temperature 0.9
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+`
+
+	tmpl, err := template.New("").Parse(expectedModelfile)
+	assert.Nil(t, err)
+
+	var buf bytes.Buffer
+	err = tmpl.Execute(&buf, opts)
+	assert.Nil(t, err)
+	assert.Equal(t, buf.String(), mf)
+
+	opts.ParentModel = "horseshark"
+	mf = buildModelfile(opts)
+	expectedModelfile = `FROM {{.ParentModel}}
+SYSTEM """{{.System}}"""
+TEMPLATE """{{.Template}}"""
+PARAMETER penalize_newline false
+PARAMETER seed 42
+PARAMETER stop [hi there]
+PARAMETER temperature 0.9
+
+MESSAGE user """Hey there hork!"""
+MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+`
+
+	tmpl, err = template.New("").Parse(expectedModelfile)
+	assert.Nil(t, err)
+
+	var parentBuf bytes.Buffer
+	err = tmpl.Execute(&parentBuf, opts)
+	assert.Nil(t, err)
+	assert.Equal(t, parentBuf.String(), mf)
+}
docs/development.md

@@ -50,7 +50,8 @@ development and runtime packages.
 Typically the build scripts will auto-detect CUDA, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
 specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler.
+libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
+the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70").

 Then generate dependencies:
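Illustration only: the generate scripts changed below implement this fallback in shell and PowerShell; the same honor-the-env-var-or-default behavior could be sketched in Go (the helper name is hypothetical):

package main

import (
	"fmt"
	"os"
)

// cudaArchitectures returns CMAKE_CUDA_ARCHITECTURES if the user set it,
// otherwise the default list the build scripts in this commit bake in.
func cudaArchitectures() string {
	if v := os.Getenv("CMAKE_CUDA_ARCHITECTURES"); v != "" {
		return v
	}
	return "50;52;61;70;75;80"
}

func main() {
	fmt.Println("target CUDA architectures:", cudaArchitectures())
}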
docs/modelfile.md

@@ -19,6 +19,7 @@ A model file is the blueprint to create and share models with Ollama.
   - [SYSTEM](#system)
   - [ADAPTER](#adapter)
   - [LICENSE](#license)
+  - [MESSAGE](#message)
 - [Notes](#notes)

 ## Format

@@ -38,6 +39,7 @@ INSTRUCTION arguments
 | [`SYSTEM`](#system) | Specifies the system message that will be set in the template. |
 | [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. |
 | [`LICENSE`](#license) | Specifies the legal license. |
+| [`MESSAGE`](#message) | Specify message history. |

 ## Examples

@@ -205,6 +207,19 @@ LICENSE """
 """
 ```
+
+### MESSAGE
+
+The `MESSAGE` instruction allows you to specify a message history for the model to use when responding:
+
+```modelfile
+MESSAGE user Is Toronto in Canada?
+MESSAGE assistant yes
+MESSAGE user Is Sacramento in Canada?
+MESSAGE assistant no
+MESSAGE user Is Ontario in Canada?
+MESSAGE assistant yes
+```
+
 ## Notes

 - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments.
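The /save handler in cmd/interactive.go above generates exactly this kind of Modelfile. As a hedged client-side sketch using only the API calls visible in this diff, a model with seeded MESSAGE history could be created like so (the base model and model name are illustrative):

package main

import (
	"context"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}

	// A Modelfile using the new MESSAGE instruction to seed chat history.
	modelfile := `FROM llama2
MESSAGE user Is Toronto in Canada?
MESSAGE assistant yes
`

	req := &api.CreateRequest{
		Name:      "geography-demo", // illustrative model name
		Modelfile: modelfile,
	}
	fn := func(resp api.ProgressResponse) error { return nil }
	if err := client.Create(context.Background(), req, fn); err != nil {
		panic(err)
	}
}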
gpu/gpu.go

@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"

@@ -29,8 +30,8 @@ type handles struct {
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil

-// With our current CUDA compile flags, 5.2 and older will not work properly
-const CudaComputeMajorMin = 6
+// With our current CUDA compile flags, older than 5.0 will not work properly
+var CudaComputeMin = [2]C.int{5, 0}

 // Possible locations for the nvidia-ml library
 var CudaLinuxGlobs = []string{

@@ -121,9 +122,15 @@ func GetGPUInfo() GpuInfo {
 		initGPUHandles()
 	}

+	// All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX
+	cpuVariant := GetCPUVariant()
+	if cpuVariant == "" {
+		slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.")
+	}
+
 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil {
+	if gpuHandles.cuda != nil && cpuVariant != "" {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
 			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))

@@ -135,19 +142,40 @@ func GetGPUInfo() GpuInfo {
 		if cc.err != nil {
 			slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
 			C.free(unsafe.Pointer(cc.err))
-		} else if cc.major >= CudaComputeMajorMin {
+		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
 		} else {
 			slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}
-	} else if gpuHandles.rocm != nil {
+	} else if gpuHandles.rocm != nil && cpuVariant != "" {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
 		if memInfo.err != nil {
 			slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
+			// Only one GPU detected and it appears to be an integrated GPU - skip it
+			slog.Info("ROCm unsupported integrated GPU detected")
 		} else {
+			if memInfo.igpu_index >= 0 {
+				// We have multiple GPUs reported, and one of them is an integrated GPU
+				// so we have to set the env var to bypass it
+				// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
+				val := os.Getenv("ROCR_VISIBLE_DEVICES")
+				if val == "" {
+					devices := []string{}
+					for i := 0; i < int(memInfo.count); i++ {
+						if i == int(memInfo.igpu_index) {
+							continue
+						}
+						devices = append(devices, strconv.Itoa(i))
+					}
+					val = strings.Join(devices, ",")
+					os.Setenv("ROCR_VISIBLE_DEVICES", val)
+				}
+				slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
+			}
 			resp.Library = "rocm"
 			var version C.rocm_version_resp_t
 			C.rocm_get_version(*gpuHandles.rocm, &version)

@@ -163,7 +191,7 @@ func GetGPUInfo() GpuInfo {
 	if resp.Library == "" {
 		C.cpu_check_ram(&memInfo)
 		resp.Library = "cpu"
-		resp.Variant = GetCPUVariant()
+		resp.Variant = cpuVariant
 	}
 	if memInfo.err != nil {
 		slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err)))

@@ -199,7 +227,9 @@ func CheckVRAM() (int64, error) {
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
-		return int64(gpuInfo.FreeMemory - overhead), nil
+		avail := int64(gpuInfo.FreeMemory - overhead)
+		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
+		return avail, nil
 	}

 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
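The minimum CUDA compute capability is now a (major, minor) pair instead of a major-only constant, so 5.x cards at or above 5.0 qualify where they previously did not. A standalone sketch of the same comparison, with cgo types dropped:

package main

import "fmt"

// cudaComputeMin mirrors CudaComputeMin in gpu/gpu.go: anything older
// than compute capability 5.0 falls back to CPU mode.
var cudaComputeMin = [2]int{5, 0}

func supported(major, minor int) bool {
	return major > cudaComputeMin[0] ||
		(major == cudaComputeMin[0] && minor >= cudaComputeMin[1])
}

func main() {
	fmt.Println(supported(3, 7)) // false: too old
	fmt.Println(supported(5, 0)) // true: exactly the new floor
	fmt.Println(supported(8, 6)) // true
}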
gpu/gpu_info.h

@@ -42,6 +42,7 @@ typedef struct mem_info {
   uint64_t total;
   uint64_t free;
   unsigned int count;
+  int igpu_index; // If >= 0, we detected an integrated GPU to ignore
   char *err;  // If non-nill, caller responsible for freeing
 } mem_info_t;
gpu/gpu_info_cuda.c

@@ -70,6 +70,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       resp->ch.handle = NULL;
       snprintf(buf, buflen, "nvml vram init failure: %d", ret);
       resp->err = strdup(buf);
+      return;
     }

   // Report driver version if we're in verbose mode, ignore errors
gpu/gpu_info_rocm.c

@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
+  resp->igpu_index = -1;
   uint64_t totalMem = 0;
   uint64_t usedMem = 0;
   rsmi_status_t ret;

@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     }
     LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
     LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
-    resp->total += totalMem;
-    resp->free += totalMem - usedMem;
+    if (totalMem < 1024 * 1024 * 1024) {
+      // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
+      LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
+      resp->igpu_index = i;
+    } else {
+      resp->total += totalMem;
+      resp->free += totalMem - usedMem;
+    }
   }

@@ -171,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
   const int buflen = 256;
   char buf[buflen + 1];
   if (h.handle == NULL) {
-    resp->str = strdup("nvml handle not initialized");
+    resp->str = strdup("rocm handle not initialized");
     resp->status = 1;
     return;
   }

@@ -188,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
   resp->str = strdup(buf);
 }
-#endif  // __APPLE__
\ No newline at end of file
+#endif  // __APPLE__
llm/dyn_ext_server.go

@@ -190,6 +190,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		"seed":         predict.Options.Seed,
 		"stop":         predict.Options.Stop,
 		"image_data":   imageData,
+		"cache_prompt": true,
 	}

 	if predict.Format == "json" {
llm/generate/gen_common.sh

@@ -39,6 +39,9 @@ init_vars() {
     *)
         ;;
     esac
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ]; then
+        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    fi
 }

 git_module_setup() {

@@ -61,6 +64,17 @@ apply_patches() {
     if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
         echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
     fi
+
+    # apply temporary patches until fix is upstream
+    for patch in ../patches/*.diff; do
+        for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+            (cd ${LLAMACPP_DIR}; git checkout ${file})
+        done
+    done
+    for patch in ../patches/*.diff; do
+        (cd ${LLAMACPP_DIR} && git apply ${patch})
+    done
+
     # Avoid duplicate main symbols when we link into the cgo binary
     sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
         mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
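The new apply_patches loop first checks out a pristine copy of every file a patch touches, so regenerating is idempotent, then applies each patch. For illustration only, a rough Go equivalent of the `grep "^+++ " | cut` path extraction (not part of the repo):

package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// patchTargets lists the files a unified diff touches, mirroring
// grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/ from gen_common.sh.
func patchTargets(path string) ([]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var files []string
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, "+++ ") {
			continue
		}
		// "+++ b/examples/server/server.cpp" -> "examples/server/server.cpp"
		target := strings.TrimPrefix(line, "+++ ")
		if i := strings.Index(target, "/"); i >= 0 {
			target = target[i+1:]
		}
		files = append(files, target)
	}
	return files, sc.Err()
}

func main() {
	files, err := patchTargets("llm/patches/01-cache.diff")
	if err != nil {
		panic(err)
	}
	fmt.Println(files) // e.g. [examples/server/server.cpp]
}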
llm/generate/gen_linux.sh

@@ -140,7 +140,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     if [ -n "${CUDA_MAJOR}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
llm/generate/gen_windows.ps1

@@ -25,6 +25,11 @@ function init_vars {
     }
     $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
     $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
+    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
+        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    } else {
+        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
+    }
 }

 function git_module_setup {

@@ -40,6 +45,29 @@ function apply_patches {
     if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
         Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
     }
+
+    # Apply temporary patches until fix is upstream
+    $patches = Get-ChildItem "../patches/*.diff"
+    foreach ($patch in $patches) {
+        # Extract file paths from the patch file
+        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
+            $parts = $_ -split ' '
+            ($parts[1] -split '/', 2)[1]
+        }
+
+        # Checkout each file
+        foreach ($file in $filePaths) {
+            Set-Location -Path ${script:llamacppDir}
+            git checkout $file
+        }
+    }
+
+    # Apply each patch
+    foreach ($patch in $patches) {
+        Set-Location -Path ${script:llamacppDir}
+        git apply $patch.FullName
+    }
+
     # Avoid duplicate main symbols when we link into the cgo binary
     $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
     $content = $content -replace 'int main\(', 'int __main('

@@ -128,7 +156,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
 }
 init_vars
 $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
 build
 install
 cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
llm/gguf.go

@@ -69,12 +69,65 @@ type tensor struct {
 	name   string
 	kind   uint32
 	offset uint64
-	size   uint64

 	// shape is the number of elements in each dimension
 	shape [4]uint64
 }

+func (t tensor) blockSize() uint64 {
+	switch {
+	case t.kind < 2:
+		return 1
+	case t.kind < 10:
+		return 32
+	default:
+		return 256
+	}
+}
+
+func (t tensor) typeSize() uint64 {
+	blockSize := t.blockSize()
+
+	switch t.kind {
+	case 0: // FP32
+		return 4
+	case 1: // FP16
+		return 2
+	case 2: // Q4_0
+		return 2 + blockSize/2
+	case 3: // Q4_1
+		return 2 + 2 + blockSize/2
+	case 6: // Q5_0
+		return 2 + 4 + blockSize/2
+	case 7: // Q5_1
+		return 2 + 2 + 4 + blockSize/2
+	case 8: // Q8_0
+		return 2 + blockSize
+	case 9: // Q8_1
+		return 4 + 4 + blockSize
+	case 10: // Q2_K
+		return blockSize/16 + blockSize/4 + 2 + 2
+	case 11: // Q3_K
+		return blockSize/8 + blockSize/4 + 12 + 2
+	case 12: // Q4_K
+		return 2 + 2 + 12 + blockSize/2
+	case 13: // Q5_K
+		return 2 + 2 + 12 + blockSize/8 + blockSize/2
+	case 14: // Q6_K
+		return blockSize/2 + blockSize/4 + blockSize/16 + 2
+	default:
+		return 0
+	}
+}
+
+func (t tensor) parameters() uint64 {
+	return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
+}
+
+func (t tensor) size() uint64 {
+	return t.parameters() * t.typeSize() / t.blockSize()
+}
+
 type ggufModel struct {
 	*containerGGUF

@@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
 			shape[i] = llm.readU64(rso)
 		}

-		kind := llm.readU32(rso)
-		offset := llm.readU64(rso)
-
-		var blockSize uint64
-		switch {
-		case kind < 2:
-			blockSize = 1
-		case kind < 10:
-			blockSize = 32
-		default:
-			blockSize = 256
-		}
-
-		var typeSize uint64
-		switch kind {
-		case 0: // FP32
-			typeSize = 4
-		case 1: // FP16
-			typeSize = 2
-		case 2: // Q4_0
-			typeSize = 2 + blockSize/2
-		case 3: // Q4_1
-			typeSize = 2 + 2 + blockSize/2
-		case 6: // Q5_0
-			typeSize = 2 + 4 + blockSize/2
-		case 7: // Q5_1
-			typeSize = 2 + 2 + 4 + blockSize/2
-		case 8: // Q8_0
-			typeSize = 2 + blockSize
-		case 9: // Q8_1
-			typeSize = 4 + 4 + blockSize
-		case 10: // Q2_K
-			typeSize = blockSize/16 + blockSize/4 + 2 + 2
-		case 11: // Q3_K
-			typeSize = blockSize/8 + blockSize/4 + 12 + 2
-		case 12: // Q4_K
-			typeSize = 2 + 2 + 12 + blockSize/2
-		case 13: // Q5_K
-			typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2
-		case 14: // Q6_K
-			typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2
-		}
-
-		parameters := shape[0] * shape[1] * shape[2] * shape[3]
-		size := parameters * typeSize / blockSize
-
-		llm.tensors = append(llm.tensors, tensor{
-			name:   name,
-			kind:   kind,
-			offset: offset,
-			size:   size,
-			shape:  shape,
-		})
-
-		llm.parameters += parameters
+		tensor := tensor{
+			name:   name,
+			kind:   llm.readU32(rso),
+			offset: llm.readU64(rso),
+			shape:  shape,
+		}
+
+		llm.tensors = append(llm.tensors, tensor)
+		llm.parameters += tensor.parameters()
 	}

 	alignment, ok := llm.kv["general.alignment"].(uint32)

@@ -265,7 +272,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
 	rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)

 	for _, tensor := range llm.tensors {
-		padded := (int64(tensor.size) + int64(alignment) - 1) &^ (int64(alignment) - 1)
+		padded := (int64(tensor.size()) + int64(alignment) - 1) &^ (int64(alignment) - 1)
 		rso.Seek(padded, io.SeekCurrent)
 	}
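Worked example of the new arithmetic: Q4_0 (kind 2) has blockSize 32 and typeSize 2 + 32/2 = 18 bytes per block, so a 4096x4096 tensor holds 16,777,216 parameters and occupies 16,777,216 * 18 / 32 = 9,437,184 bytes. A trimmed standalone sketch, covering just the kinds needed for this calculation (the shape is chosen for illustration):

package main

import "fmt"

// tensor is a trimmed copy of the struct in llm/gguf.go, enough to
// exercise the new size arithmetic for one quantization kind.
type tensor struct {
	kind  uint32
	shape [4]uint64
}

func (t tensor) blockSize() uint64 {
	switch {
	case t.kind < 2:
		return 1
	case t.kind < 10:
		return 32
	default:
		return 256
	}
}

func (t tensor) typeSize() uint64 {
	switch t.kind {
	case 0: // FP32: 4 bytes per element, block size 1
		return 4
	case 2: // Q4_0: fp16 scale plus 4 bits per element in a 32-element block
		return 2 + t.blockSize()/2
	default:
		panic("kind not covered in this sketch")
	}
}

func (t tensor) parameters() uint64 {
	return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
}

func (t tensor) size() uint64 {
	return t.parameters() * t.typeSize() / t.blockSize()
}

func main() {
	q4 := tensor{kind: 2, shape: [4]uint64{4096, 4096, 1, 1}}
	fmt.Println(q4.size()) // 9437184 bytes for a 4096x4096 Q4_0 tensor
}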
llm/llama.cpp (submodule): compare 011e8ec5...cd4fddb2

-Subproject commit 011e8ec577fd135cbc02993d3ea9840c516d6a1c
+Subproject commit cd4fddb29f81d6a1f6d51a0c016bc6b486d68def
llm/patches/01-cache.diff (new file, mode 100644)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd2..4fa7b57f 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1857,12 +1857,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
-                    slot.cache_tokens = prompt_tokens;
-
                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
@@ -1870,6 +1864,12 @@ struct llama_server_context
                         slot.n_past--;
                     }
 
+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
+                    slot.cache_tokens = prompt_tokens;
+
                     LOG_VERBOSE("prompt ingested", {
                         {"n_past", slot.n_past},
                         {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
parser/parser.go

@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"slices"
 )

 type Command struct {

@@ -56,6 +57,16 @@ func Parse(reader io.Reader) ([]Command, error) {
 			command.Args = string(bytes.TrimSpace(fields[1]))
 		case "EMBED":
 			return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead")
+		case "MESSAGE":
+			command.Name = string(bytes.ToLower(fields[0]))
+			fields = bytes.SplitN(fields[1], []byte(" "), 2)
+			if len(fields) < 2 {
+				return nil, fmt.Errorf("should be in the format <role> <message>")
+			}
+			if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) {
+				return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
+			}
+			command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1]))
 		default:
 			if !bytes.HasPrefix(fields[0], []byte("#")) {
 				// log a warning for unknown commands
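A standalone sketch of the role validation and "<role>: <content>" packing the new MESSAGE case performs, with the parser's byte handling simplified to strings:

package main

import (
	"fmt"
	"slices"
	"strings"
)

// parseMessage mimics the new MESSAGE branch in parser.Parse: split off
// the role, validate it, and pack the result as "<role>: <content>".
func parseMessage(args string) (string, error) {
	fields := strings.SplitN(args, " ", 2)
	if len(fields) < 2 {
		return "", fmt.Errorf("should be in the format <role> <message>")
	}
	role := strings.ToLower(fields[0])
	if !slices.Contains([]string{"system", "user", "assistant"}, role) {
		return "", fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"")
	}
	return fmt.Sprintf("%s: %s", role, fields[1]), nil
}

func main() {
	out, err := parseMessage("user Is Toronto in Canada?")
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // user: Is Toronto in Canada?
}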