Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
ed740a25
Unverified
Commit
ed740a25
authored
May 06, 2024
by
Jeffrey Morgan
Committed by
GitHub
May 06, 2024
Browse files
Fix `no slots available` error with concurrent requests (#4160)
parent
c9f98622
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
115 additions
and
112 deletions
+115
-112
llm/server.go
llm/server.go
+115
-112
No files found.
llm/server.go
View file @
ed740a25
...
@@ -338,7 +338,7 @@ type ServerStatus int
...
@@ -338,7 +338,7 @@ type ServerStatus int
const
(
// iota is reset to 0
const
(
// iota is reset to 0
ServerStatusReady
ServerStatus
=
iota
ServerStatusReady
ServerStatus
=
iota
ServerStatusNoSlotsAvai
a
lble
ServerStatusNoSlotsAvail
a
ble
ServerStatusLoadingModel
ServerStatusLoadingModel
ServerStatusNotResponding
ServerStatusNotResponding
ServerStatusError
ServerStatusError
...
@@ -348,7 +348,7 @@ func (s ServerStatus) ToString() string {
...
@@ -348,7 +348,7 @@ func (s ServerStatus) ToString() string {
switch
s
{
switch
s
{
case
ServerStatusReady
:
case
ServerStatusReady
:
return
"llm server ready"
return
"llm server ready"
case
ServerStatusNoSlotsAvai
a
lble
:
case
ServerStatusNoSlotsAvail
a
ble
:
return
"llm busy - no slots available"
return
"llm busy - no slots available"
case
ServerStatusLoadingModel
:
case
ServerStatusLoadingModel
:
return
"llm server loading model"
return
"llm server loading model"
...
@@ -405,7 +405,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
...
@@ -405,7 +405,7 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
case
"ok"
:
case
"ok"
:
return
ServerStatusReady
,
nil
return
ServerStatusReady
,
nil
case
"no slot available"
:
case
"no slot available"
:
return
ServerStatusNoSlotsAvai
a
lble
,
nil
return
ServerStatusNoSlotsAvail
a
ble
,
nil
case
"loading model"
:
case
"loading model"
:
return
ServerStatusLoadingModel
,
nil
return
ServerStatusLoadingModel
,
nil
default
:
default
:
...
@@ -413,6 +413,29 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
...
@@ -413,6 +413,29 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
}
}
}
}
// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
func
(
s
*
llmServer
)
getServerStatusRetry
(
ctx
context
.
Context
)
(
ServerStatus
,
error
)
{
var
retries
int
for
{
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
return
status
,
err
}
if
status
==
ServerStatusNoSlotsAvailable
{
if
retries
>=
10
{
return
status
,
fmt
.
Errorf
(
"no slots available after %d retries"
,
retries
)
}
time
.
Sleep
(
5
*
time
.
Millisecond
)
retries
++
continue
}
return
status
,
nil
}
}
func
(
s
*
llmServer
)
Ping
(
ctx
context
.
Context
)
error
{
func
(
s
*
llmServer
)
Ping
(
ctx
context
.
Context
)
error
{
_
,
err
:=
s
.
getServerStatus
(
ctx
)
_
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
...
@@ -510,7 +533,6 @@ ws ::= ([ \t\n] ws)?
...
@@ -510,7 +533,6 @@ ws ::= ([ \t\n] ws)?
`
`
const
maxBufferSize
=
512
*
format
.
KiloByte
const
maxBufferSize
=
512
*
format
.
KiloByte
const
maxRetries
=
3
type
ImageData
struct
{
type
ImageData
struct
{
Data
[]
byte
`json:"data"`
Data
[]
byte
`json:"data"`
...
@@ -586,7 +608,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -586,7 +608,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
// Make sure the server is ready
// Make sure the server is ready
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
Retry
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
err
return
err
}
else
if
status
!=
ServerStatusReady
{
}
else
if
status
!=
ServerStatusReady
{
...
@@ -600,133 +622,113 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
...
@@ -600,133 +622,113 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
}
}
}
retryDelay
:=
100
*
time
.
Microsecond
// Handling JSON marshaling with special characters unescaped.
for
retries
:=
0
;
retries
<
maxRetries
;
retries
++
{
buffer
:=
&
bytes
.
Buffer
{}
if
retries
>
0
{
enc
:=
json
.
NewEncoder
(
buffer
)
time
.
Sleep
(
retryDelay
)
// wait before retrying
enc
.
SetEscapeHTML
(
false
)
retryDelay
*=
2
// exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
if
err
:=
enc
.
Encode
(
request
);
err
!=
nil
{
buffer
:=
&
bytes
.
Buffer
{}
return
fmt
.
Errorf
(
"failed to marshal data: %v"
,
err
)
enc
:=
json
.
NewEncoder
(
buffer
)
}
enc
.
SetEscapeHTML
(
false
)
if
err
:=
enc
.
Encode
(
request
);
err
!=
nil
{
endpoint
:=
fmt
.
Sprintf
(
"http://127.0.0.1:%d/completion"
,
s
.
port
)
return
fmt
.
Errorf
(
"failed to marshal data: %v"
,
err
)
serverReq
,
err
:=
http
.
NewRequestWithContext
(
ctx
,
http
.
MethodPost
,
endpoint
,
buffer
)
}
if
err
!=
nil
{
return
fmt
.
Errorf
(
"error creating POST request: %v"
,
err
)
}
serverReq
.
Header
.
Set
(
"Content-Type"
,
"application/json"
)
endpoint
:=
fmt
.
Sprintf
(
"http://127.0.0.1:%d/completion"
,
s
.
port
)
res
,
err
:=
http
.
DefaultClient
.
Do
(
serverReq
)
req
,
err
:=
http
.
NewRequestWithContext
(
ctx
,
http
.
MethodPost
,
endpoint
,
buffer
)
if
err
!=
nil
{
if
err
!=
nil
{
return
fmt
.
Errorf
(
"POST predict: %v"
,
err
)
return
fmt
.
Errorf
(
"error creating POST request: %v"
,
err
)
}
}
defer
res
.
Body
.
Close
()
req
.
Header
.
Set
(
"Content-Type"
,
"application/json"
)
resp
,
err
:=
http
.
DefaultClient
.
Do
(
req
)
if
res
.
StatusCode
>=
400
{
bodyBytes
,
err
:=
io
.
ReadAll
(
res
.
Body
)
if
err
!=
nil
{
if
err
!=
nil
{
return
fmt
.
Errorf
(
"POST predict: %v"
,
err
)
return
fmt
.
Errorf
(
"failed reading llm error response: %w"
,
err
)
}
defer
resp
.
Body
.
Close
()
if
resp
.
StatusCode
>=
400
{
bodyBytes
,
err
:=
io
.
ReadAll
(
resp
.
Body
)
if
err
!=
nil
{
return
fmt
.
Errorf
(
"failed reading llm error response: %w"
,
err
)
}
log
.
Printf
(
"llm predict error: %s"
,
bodyBytes
)
return
fmt
.
Errorf
(
"%s"
,
bodyBytes
)
}
}
log
.
Printf
(
"llm predict error: %s"
,
bodyBytes
)
return
fmt
.
Errorf
(
"%s"
,
bodyBytes
)
}
scanner
:=
bufio
.
NewScanner
(
resp
.
Body
)
scanner
:=
bufio
.
NewScanner
(
res
.
Body
)
buf
:=
make
([]
byte
,
0
,
maxBufferSize
)
buf
:=
make
([]
byte
,
0
,
maxBufferSize
)
scanner
.
Buffer
(
buf
,
maxBufferSize
)
scanner
.
Buffer
(
buf
,
maxBufferSize
)
retryNeeded
:=
false
// keep track of the last token generated, this is used to abort if the model starts looping
var
lastToken
string
var
tokenRepeat
int
for
scanner
.
Scan
()
{
// keep track of the last token generated, this is used to abort if the model starts looping
select
{
var
lastToken
string
case
<-
ctx
.
Done
()
:
var
tokenRepeat
int
// This handles the request cancellation
return
ctx
.
Err
()
default
:
line
:=
scanner
.
Bytes
()
if
len
(
line
)
==
0
{
continue
}
// try again on slot unavailable
for
scanner
.
Scan
()
{
if
bytes
.
Contains
(
line
,
[]
byte
(
"slot unavailable"
))
{
select
{
retryNeeded
=
true
case
<-
ctx
.
Done
()
:
break
// This handles the request cancellation
}
return
ctx
.
Err
()
default
:
line
:=
scanner
.
Bytes
()
if
len
(
line
)
==
0
{
continue
}
evt
,
ok
:=
bytes
.
CutPrefix
(
line
,
[]
byte
(
"data: "
))
evt
,
ok
:=
bytes
.
CutPrefix
(
line
,
[]
byte
(
"data: "
))
if
!
ok
{
if
!
ok
{
return
fmt
.
Errorf
(
"error parsing llm response stream: %s"
,
line
)
return
fmt
.
Errorf
(
"error parsing llm response stream: %s"
,
line
)
}
}
var
c
completion
var
c
completion
if
err
:=
json
.
Unmarshal
(
evt
,
&
c
);
err
!=
nil
{
if
err
:=
json
.
Unmarshal
(
evt
,
&
c
);
err
!=
nil
{
return
fmt
.
Errorf
(
"error unmarshaling llm prediction response: %v"
,
err
)
return
fmt
.
Errorf
(
"error unmarshaling llm prediction response: %v"
,
err
)
}
}
switch
{
switch
{
case
strings
.
TrimSpace
(
c
.
Content
)
==
lastToken
:
case
strings
.
TrimSpace
(
c
.
Content
)
==
lastToken
:
tokenRepeat
++
tokenRepeat
++
default
:
default
:
lastToken
=
strings
.
TrimSpace
(
c
.
Content
)
lastToken
=
strings
.
TrimSpace
(
c
.
Content
)
tokenRepeat
=
0
tokenRepeat
=
0
}
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
// 30 picked as an arbitrary max token repeat limit, modify as needed
if
tokenRepeat
>
30
{
if
tokenRepeat
>
30
{
slog
.
Debug
(
"prediction aborted, token repeat limit reached"
)
slog
.
Debug
(
"prediction aborted, token repeat limit reached"
)
return
ctx
.
Err
()
return
ctx
.
Err
()
}
}
if
c
.
Content
!=
""
{
if
c
.
Content
!=
""
{
fn
(
CompletionResponse
{
fn
(
CompletionResponse
{
Content
:
c
.
Content
,
Content
:
c
.
Content
,
})
})
}
}
if
c
.
Stop
{
if
c
.
Stop
{
fn
(
CompletionResponse
{
fn
(
CompletionResponse
{
Done
:
true
,
Done
:
true
,
PromptEvalCount
:
c
.
Timings
.
PromptN
,
PromptEvalCount
:
c
.
Timings
.
PromptN
,
PromptEvalDuration
:
parseDurationMs
(
c
.
Timings
.
PromptMS
),
PromptEvalDuration
:
parseDurationMs
(
c
.
Timings
.
PromptMS
),
EvalCount
:
c
.
Timings
.
PredictedN
,
EvalCount
:
c
.
Timings
.
PredictedN
,
EvalDuration
:
parseDurationMs
(
c
.
Timings
.
PredictedMS
),
EvalDuration
:
parseDurationMs
(
c
.
Timings
.
PredictedMS
),
})
})
return
nil
return
nil
}
}
}
}
}
}
if
err
:=
scanner
.
Err
();
err
!=
nil
{
if
err
:=
scanner
.
Err
();
err
!=
nil
{
if
strings
.
Contains
(
err
.
Error
(),
"unexpected EOF"
)
{
if
strings
.
Contains
(
err
.
Error
(),
"unexpected EOF"
)
{
s
.
Close
()
s
.
Close
()
msg
:=
""
msg
:=
""
if
s
.
status
!=
nil
&&
s
.
status
.
LastErrMsg
!=
""
{
if
s
.
status
!=
nil
&&
s
.
status
.
LastErrMsg
!=
""
{
msg
=
s
.
status
.
LastErrMsg
msg
=
s
.
status
.
LastErrMsg
}
return
fmt
.
Errorf
(
"an unknown error was encountered while running the model %s"
,
msg
)
}
}
return
fmt
.
Errorf
(
"
error reading llm response:
%
v
"
,
err
)
return
fmt
.
Errorf
(
"
an unknown error was encountered while running the model
%
s
"
,
msg
)
}
}
if
!
retryNeeded
{
return
fmt
.
Errorf
(
"error reading llm response: %v"
,
err
)
return
nil
// success
}
}
}
// should never reach here ideally
return
nil
return
fmt
.
Errorf
(
"max retries exceeded"
)
}
}
type
EmbeddingRequest
struct
{
type
EmbeddingRequest
struct
{
...
@@ -743,8 +745,9 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
...
@@ -743,8 +745,9 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er
return
nil
,
err
return
nil
,
err
}
}
defer
s
.
sem
.
Release
(
1
)
defer
s
.
sem
.
Release
(
1
)
// Make sure the server is ready
// Make sure the server is ready
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
Retry
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
else
if
status
!=
ServerStatusReady
{
}
else
if
status
!=
ServerStatusReady
{
...
@@ -799,7 +802,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
...
@@ -799,7 +802,7 @@ func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvai
a
lble
{
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvail
a
ble
{
return
nil
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
return
nil
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
}
}
...
@@ -851,7 +854,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
...
@@ -851,7 +854,7 @@ func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error
status
,
err
:=
s
.
getServerStatus
(
ctx
)
status
,
err
:=
s
.
getServerStatus
(
ctx
)
if
err
!=
nil
{
if
err
!=
nil
{
return
""
,
err
return
""
,
err
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvai
a
lble
{
}
else
if
status
!=
ServerStatusReady
&&
status
!=
ServerStatusNoSlotsAvail
a
ble
{
return
""
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
return
""
,
fmt
.
Errorf
(
"unexpected server status: %s"
,
status
.
ToString
())
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment