OpenDAS / ollama · Commit e3936d4f (unverified)

Support Multiple LoRa Adapters (#7667)

Closes #7627

Authored Nov 28, 2024 by ItzCrazyKns; committed by GitHub on Nov 27, 2024
Parent: 940e6277

Showing 2 changed files with 26 additions and 14 deletions:
- llama/runner/runner.go (+23, -8)
- llm/server.go (+3, -6)
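At a glance: the runner's --lora flag becomes repeatable, loadModel applies every supplied adapter at load time, and llm/server.go drops its single-adapter guard so each adapter path is forwarded to the runner.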
llama/runner/runner.go (view file @ e3936d4f)
```diff
@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
```
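multiLPath is a plain []string; the Set and String methods added above are exactly what the standard library's flag.Value interface requires, which is what lets main() register the --lora flag with flag.Var further down. Changing lpath's type here is also what drives the call-site updates in the remaining hunks.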
```diff
@@ -857,12 +868,14 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
 	if ppath != "" {
 		var err error
```
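Note that adapters are applied sequentially, each with a fixed scale of 1.0, and a failure on any adapter panics the runner, matching the previous single-adapter behavior.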
```diff
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
```
```diff
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 
 	if *requirements {
 		printRequirements(os.Stdout)
```
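For readers unfamiliar with the pattern: flag.Var accepts any value implementing flag.Value and calls Set once per occurrence of the flag on the command line, so the slice accumulates every -lora argument. A minimal, self-contained sketch of the same mechanism (the multiPath name and example paths are illustrative, not from the commit):

```go
package main

import (
	"flag"
	"fmt"
	"strings"
)

// multiPath mirrors the commit's multiLPath: a string slice that
// satisfies flag.Value so it can be bound to a repeatable flag.
type multiPath []string

// Set is invoked by the flag package once per flag occurrence.
func (m *multiPath) Set(value string) error {
	*m = append(*m, value)
	return nil
}

// String renders the collected values; the flag package also calls it
// when printing defaults.
func (m *multiPath) String() string {
	return strings.Join(*m, ", ")
}

func main() {
	var paths multiPath
	flag.Var(&paths, "lora", "path to a lora adapter (repeatable)")
	flag.Parse()

	// go run . -lora a.gguf -lora b.gguf  ->  a.gguf, b.gguf
	fmt.Println(paths.String())
}
```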
```diff
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
```
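As before, mmap stays disabled whenever any lora path was supplied; the check simply changes from comparing the single *lpath string to testing whether lpaths.String() is non-empty. (Presumably this is because applying an adapter modifies the loaded weights, which does not mix well with memory-mapped model files; the commit itself does not state the rationale.)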
```diff
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
```
llm/server.go (view file @ e3936d4f)
```diff
@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
```
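With this early return gone, adapter lists holding more than one entry now reach the parameter-building code in the next hunk instead of being rejected up front.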
```diff
@@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
+		for _, adapter := range adapters {
+			params = append(params, "--lora", adapter)
+		}
 	}
 
 	if len(projectors) > 0 {
```
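The net effect of the loop is that every adapter path becomes its own --lora argument on the runner command line, where previously only adapters[0] was forwarded. A small illustration of the argument construction (paths are hypothetical):

```go
package main

import "fmt"

func main() {
	// Hypothetical adapter and model paths, for illustration only.
	adapters := []string{"/tmp/style.gguf", "/tmp/domain.gguf"}
	params := []string{"--model", "/tmp/base.gguf"}

	// Mirrors the loop in NewLlamaServer: one --lora flag per adapter.
	for _, adapter := range adapters {
		params = append(params, "--lora", adapter)
	}

	fmt.Println(params)
	// [--model /tmp/base.gguf --lora /tmp/style.gguf --lora /tmp/domain.gguf]
}
```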