Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
ed740a25
Unverified
Commit
ed740a25
authored
May 06, 2024
by
Jeffrey Morgan
Committed by
GitHub
May 06, 2024
Browse files
Fix `no slots available` error with concurrent requests (#4160)
parent
c9f98622
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
115 additions
and
112 deletions
+115
-112
llm/server.go
llm/server.go
+115
-112
No files found.
llm/server.go
View file @
ed740a25
...
@@ -338,7 +338,7 @@ type ServerStatus int
...
@@ -338,7 +338,7 @@ type ServerStatus int
const
(
// iota is reset to 0
const
(
// iota is reset to 0
ServerStatusReady
ServerStatus
=
iota
ServerStatusReady
ServerStatus
=
iota
ServerStatusNoSlotsAvai
a
lble
ServerStatusNoSlotsAvail
a
ble
ServerStatusLoadingModel
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusNotResponding
ServerStatusError
ServerStatusError
...
@@ -348,7 +348,7 @@ func (s ServerStatus) ToString() string {
...
@@ -348,7 +348,7 @@ func (s ServerStatus) ToString() string {
switch
s
{
switch
s
{
case
ServerStatusReady
:
case
ServerStatusReady
:
return
"llm server ready"
return
"llm server ready"
case
ServerStatusNoSlotsAvai
a
lble
:
case
ServerStatusNoSlotsAvail
a
ble
:
return
"llm busy - no slots available"
return
"llm busy - no slots available"
case
ServerStatusLoadingModel
:
case
ServerStatusLoadingModel
:
return
"llm server loading model"
return
"llm server loading model"
...
@@ -405,7 +405,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
...
@@ -405,7 +405,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
case
"ok"
:
case
"ok"
:
return
ServerStatusReady
,
nil
return
ServerStatusReady
,
nil
case
"no slot available"
:
case
"no slot available"
:
return
ServerStatusNoSlotsAvai
a
lble
,
nil
return
ServerStatusNoSlotsAvail
a
ble
,
nil
case
"loading model"
:
case
"loading model"
:
return
ServerStatusLoadingModel
,
nil
return
ServerStatusLoadingModel
,
nil
default
:
default
:
...
@@ -413,6 +413,29 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
...
@@ -413,6 +413,29 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
}
}
}
}
// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
func
(
s
*
llmServer
)
getServerStatusRetry
(
ctx
context
.
Context
)
(
ServerStatus
,
error
)
{
var
retries
int
for
{
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
return
status
,
err
}
if
status
==
ServerStatusNoSlotsAvailable
{
if
retries
>=
10
{
return
status
,
fmt
.
Errorf
(
"no slots available after %d retries"
,
retries
)
}
time
.
Sleep
(
5
*
time
.
Millisecond
)
retries
++
continue
}
return
status
,
nil
}
}
func
(
s
*
llmServer
)
Ping
(
ctx
context
.
Context
)
error
{
func
(
s
*
llmServer
)
Ping
(
ctx
context
.
Context
)
error
{
_
,
err
:=
s
.
getServerStatus
(
ctx
)
_
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
...
@@ -510,7 +533,6 @@ ws ::= ([ \t\n] ws)?
...
@@ -510,7 +533,6 @@ ws ::= ([ \t\n] ws)?
`
`
const
maxBufferSize
=
512
*
format
.
KiloByte
const
maxBufferSize
=
512
*
format
.
KiloByte
const
maxRetries
=
3
type
ImageData
struct
{
type
ImageData
struct
{
Data
[]
byte
`json:"data"`
Data
[]
byte
`json:"data"`
...
@@ -586,7 +608,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -586,7 +608,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
// Make sure the server is ready
// Make sure the server is ready
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
Retry
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
err
return
err
}
else
if
status
!=
ServerStatusReady
{
}
else
if
status
!=
ServerStatusReady
{
...
@@ -600,13 +622,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -600,13 +622,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
}
}
retryDelay
:=
100
*
time
.
Microsecond
for
retries
:=
0
;
retries
<
maxRetries
;
retries
++
{
if
retries
>
0
{
time
.
Sleep
(
retryDelay
)
// wait before retrying
retryDelay
*=
2
// exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
// Handling JSON marshaling with special characters unescaped.
buffer
:=
&
bytes
.
Buffer
{}
buffer
:=
&
bytes
.
Buffer
{}
enc
:=
json
.
NewEncoder
(
buffer
)
enc
:=
json
.
NewEncoder
(
buffer
)
...
@@ -617,20 +632,20 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -617,20 +632,20 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
endpoint
:=
fmt
.
Sprintf
(
"http://127.0.0.1:%d/completion"
,
s
.
port
)
endpoint
:=
fmt
.
Sprintf
(
"http://127.0.0.1:%d/completion"
,
s
.
port
)
r
eq
,
err
:=
http
.
NewRequestWithContext
(
ctx
,
http
.
MethodPost
,
endpoint
,
buffer
)
serverR
eq
,
err
:=
http
.
NewRequestWithContext
(
ctx
,
http
.
MethodPost
,
endpoint
,
buffer
)
if
err
!=
nil
{
if
err
!=
nil
{
return
fmt
.
Errorf
(
"error creating POST request: %v"
,
err
)
return
fmt
.
Errorf
(
"error creating POST request: %v"
,
err
)
}
}
r
eq
.
Header
.
Set
(
"Content-Type"
,
"application/json"
)
serverR
eq
.
Header
.
Set
(
"Content-Type"
,
"application/json"
)
res
p
,
err
:=
http
.
DefaultClient
.
Do
(
r
eq
)
res
,
err
:=
http
.
DefaultClient
.
Do
(
serverR
eq
)
if
err
!=
nil
{
if
err
!=
nil
{
return
fmt
.
Errorf
(
"POST predict: %v"
,
err
)
return
fmt
.
Errorf
(
"POST predict: %v"
,
err
)
}
}
defer
res
p
.
Body
.
Close
()
defer
res
.
Body
.
Close
()
if
res
p
.
StatusCode
>=
400
{
if
res
.
StatusCode
>=
400
{
bodyBytes
,
err
:=
io
.
ReadAll
(
res
p
.
Body
)
bodyBytes
,
err
:=
io
.
ReadAll
(
res
.
Body
)
if
err
!=
nil
{
if
err
!=
nil
{
return
fmt
.
Errorf
(
"failed reading llm error response: %w"
,
err
)
return
fmt
.
Errorf
(
"failed reading llm error response: %w"
,
err
)
}
}
...
@@ -638,11 +653,10 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -638,11 +653,10 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return
fmt
.
Errorf
(
"%s"
,
bodyBytes
)
return
fmt
.
Errorf
(
"%s"
,
bodyBytes
)
}
}
scanner
:=
bufio
.
NewScanner
(
res
p
.
Body
)
scanner
:=
bufio
.
NewScanner
(
res
.
Body
)
buf
:=
make
([]
byte
,
0
,
maxBufferSize
)
buf
:=
make
([]
byte
,
0
,
maxBufferSize
)
scanner
.
Buffer
(
buf
,
maxBufferSize
)
scanner
.
Buffer
(
buf
,
maxBufferSize
)
retryNeeded
:=
false
// keep track of the last token generated, this is used to abort if the model starts looping
// keep track of the last token generated, this is used to abort if the model starts looping
var
lastToken
string
var
lastToken
string
var
tokenRepeat
int
var
tokenRepeat
int
...
@@ -658,12 +672,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -658,12 +672,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
continue
continue
}
}
// try again on slot unavailable
if
bytes
.
Contains
(
line
,
[]
byte
(
"slot unavailable"
))
{
retryNeeded
=
true
break
}
evt
,
ok
:=
bytes
.
CutPrefix
(
line
,
[]
byte
(
"data: "
))
evt
,
ok
:=
bytes
.
CutPrefix
(
line
,
[]
byte
(
"data: "
))
if
!
ok
{
if
!
ok
{
return
fmt
.
Errorf
(
"error parsing llm response stream: %s"
,
line
)
return
fmt
.
Errorf
(
"error parsing llm response stream: %s"
,
line
)
...
@@ -714,19 +722,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -714,19 +722,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
if
s
.
status
!=
nil
&&
s
.
status
.
LastErrMsg
!=
""
{
if
s
.
status
!=
nil
&&
s
.
status
.
LastErrMsg
!=
""
{
msg
=
s
.
status
.
LastErrMsg
msg
=
s
.
status
.
LastErrMsg
}
}
return
fmt
.
Errorf
(
"an unknown error was encountered while running the model %s"
,
msg
)
return
fmt
.
Errorf
(
"an unknown error was encountered while running the model %s"
,
msg
)
}
}
return
fmt
.
Errorf
(
"error reading llm response: %v"
,
err
)
}
if
!
retryNeeded
{
return
fmt
.
Errorf
(
"error reading llm response: %v"
,
err
)
return
nil
// success
}
}
}
// should never reach here ideally
return
nil
return
fmt
.
Errorf
(
"max retries exceeded"
)
}
}
type
EmbeddingRequest
struct
{
type
EmbeddingRequest
struct
{
...
@@ -743,8 +745,9 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
...
@@ -743,8 +745,9 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
return
nil
,
err
return
nil
,
err
}
}
defer
s
.
sem
.
Release
(
1
)
defer
s
.
sem
.
Release
(
1
)
// Make sure the server is ready
// Make sure the server is ready
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
Retry
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
else
if
status
!=
ServerStatusReady
{
}
else
if
status
!=
ServerStatusReady
{
...
@@ -799,7 +802,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
...
@@ -799,7 +802,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvai
a
lble
{
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvail
a
ble
{
return
nil
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
return
nil
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
}
}
...
@@ -851,7 +854,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
...
@@ -851,7 +854,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
""
,
err
return
""
,
err
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvai
a
lble
{
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvail
a
ble
{
return
""
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
return
""
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment