OpenDAS / ollama
Commit d0425f26 (unverified), authored May 09, 2024 by Daniel Hiltgen, committed by GitHub on May 09, 2024

Merge pull request #4294 from dhiltgen/harden_subprocess_reaping

Harden subprocess reaping

Parents: cfa84b84, 84ac7ce1
Showing 2 changed files with 65 additions and 59 deletions (+65 -59):

  llm/ext_server/server.cpp   +21 -21
  llm/server.go               +44 -38
llm/ext_server/server.cpp
@@ -2727,7 +2727,7 @@ static json format_detokenized_response(std::string content)

 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
 {
     // skip GH copilot requests when using default port
-    if (req.path == "/v1/health" || req.path == "/v1/completions")
+    if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
     {
         return;
     }
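The only functional change in this hunk is that the bare /health path joins /v1/health and /v1/completions (the paths originally skipped to keep GitHub Copilot probes out of the logs) in the set of requests that log_server_request ignores, presumably to keep the log quiet now that the health endpoint is available and polled while a model is still loading. As a loose illustration of the same skip-list idea, here is a minimal Go net/http logging middleware; it is a sketch, not code from this commit, and the handler and port are hypothetical.

package main

import (
	"log/slog"
	"net/http"
)

// withRequestLogging logs every request except frequently polled
// monitoring endpoints, mirroring the skip-list used in server.cpp.
func withRequestLogging(next http.Handler) http.Handler {
	skip := map[string]bool{
		"/health":         true,
		"/v1/health":      true,
		"/v1/completions": true,
	}
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if !skip[r.URL.Path] {
			slog.Info("request", "method", r.Method, "path", r.URL.Path)
		}
		next.ServeHTTP(w, r)
	})
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	})
	http.ListenAndServe("127.0.0.1:8080", withRequestLogging(mux))
}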
@@ -3054,6 +3054,26 @@ int main(int argc, char **argv) {

         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }

+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
+                if (!svr.listen_after_bind())
+                {
+                    state.store(SERVER_STATE_ERROR);
+                    return 1;
+                }
+
+                return 0;
+            });
+
     // load the model
     if (!llama.load_model(params))
     {
@@ -3258,26 +3278,6 @@ int main(int argc, char **argv) {

     }*/
     //);
-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
-    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]()
-            {
-                if (!svr.listen_after_bind())
-                {
-                    state.store(SERVER_STATE_ERROR);
-                    return 1;
-                }
-
-                return 0;
-            });
-
     llama.queue_tasks.on_new_task(std::bind(
         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
     llama.queue_tasks.on_finish_multitask(std::bind(
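Taken together, the two hunks above move the HTTP thread-pool sizing, the new_task_queue assignment, and the listener thread from after llama.load_model(params) to before it, so the monitoring endpoints (hence the "+2 threads for monitoring endpoints" head-room) can answer while the model is still loading. A rough Go sketch of that ordering, assuming a hypothetical loadModel step and readiness flag that are not part of this commit:

package main

import (
	"log/slog"
	"net/http"
	"sync/atomic"
	"time"
)

// ready flips to true once the (hypothetical) model load finishes.
var ready atomic.Bool

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		if ready.Load() {
			w.WriteHeader(http.StatusOK) // model loaded, fully serving
			return
		}
		w.WriteHeader(http.StatusServiceUnavailable) // still loading
	})

	// Start listening *before* the expensive load so health checks
	// respond during startup, as the C++ server now does.
	go func() {
		if err := http.ListenAndServe("127.0.0.1:8080", mux); err != nil {
			slog.Error("http server stopped", "error", err)
		}
	}()

	loadModel() // placeholder for the long-running model load
	ready.Store(true)
	select {} // keep serving
}

func loadModel() { time.Sleep(2 * time.Second) }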
llm/server.go
@@ -53,6 +53,7 @@ type llmServer struct {

 	estimatedTotal uint64 // Total size of model
 	totalLayers    uint64
 	gpuCount       int
+	loadDuration   time.Duration // Record how long it took the model to load

 	sem *semaphore.Weighted
 }
@@ -291,6 +292,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 			sem:         semaphore.NewWeighted(int64(numParallel)),
 			totalLayers: ggml.KV().BlockCount() + 1,
 			gpuCount:    gpuCount,
+			done:        make(chan error, 1),
 		}

 		s.cmd.Env = os.Environ()
@@ -339,6 +341,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 				continue
 			}

+		// reap subprocess when it exits
+		go func() {
+			s.done <- s.cmd.Wait()
+		}()
+
 		return s, nil
 	}
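This hunk is the heart of the hardening: immediately after the runner starts, a goroutine calls s.cmd.Wait(), so the child process is always reaped and its exit status is delivered on the new done channel (buffered with capacity 1 above, so the goroutine can finish even if nobody ever reads the result). A standalone sketch of the pattern, using a generic sleep command in place of the llama runner:

package main

import (
	"fmt"
	"os/exec"
)

func main() {
	cmd := exec.Command("sleep", "1") // stand-in for the llama runner subprocess
	if err := cmd.Start(); err != nil {
		panic(err)
	}

	// Reap the subprocess as soon as it exits; the buffered channel means
	// this goroutine finishes even if no one ever reads the result.
	done := make(chan error, 1)
	go func() {
		done <- cmd.Wait()
	}()

	// Elsewhere, callers can wait for (or poll) the exit status.
	fmt.Println("runner exited:", <-done)
}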
@@ -483,13 +490,11 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
 	// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 	expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(50 * time.Millisecond)
-	defer ticker.Stop()

 	slog.Info("waiting for llama runner to start responding")
 	var lastStatus ServerStatus = -1

 	for {
 		select {
 		case <-ctx.Done():
@@ -501,41 +506,39 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {

 				msg = s.status.LastErrMsg
 			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
-			}
-			if s.cmd.ProcessState != nil {
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-			}
-			c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
-			defer cancel()
-			status, err := s.getServerStatus(c)
-			if err != nil && lastStatus != status {
-				slog.Debug("server not yet available", "error", err)
-				lastStatus = status
-				continue
-			}
-
-			switch status {
-			case ServerStatusLoadingModel:
-				// TODO - this state never seems to happen with the current server.cpp code (bug?)
-				// it doesn't respond to the health endpoint until after the model is loaded
-				slog.Debug("loading model")
-			case ServerStatusReady:
-				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
-				return nil
-			}
-		}
+		default:
+		}
+		if time.Now().After(expiresAt) {
+			// timeout
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		}
+		if s.cmd.ProcessState != nil {
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+		}
+		ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+		defer cancel()
+		status, _ := s.getServerStatus(ctx)
+		if lastStatus != status && status != ServerStatusReady {
+			// Only log on status changes
+			slog.Info("waiting for server to become available", "status", status.ToString())
+		}
+		switch status {
+		case ServerStatusReady:
+			s.loadDuration = time.Since(start)
+			slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
+			return nil
+		default:
+			lastStatus = status
+			time.Sleep(time.Millisecond * 250)
+			continue
+		}
 	}
 }
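The rewritten wait loop drops the 50 ms ticker: each iteration now does a non-blocking select so a terminated runner (reported on s.done by the reaper) or a cancelled context is noticed immediately, then checks the 10-minute deadline and ProcessState, probes the server status, records loadDuration once the server reports ready, and otherwise sleeps for 250 ms. A compressed sketch of that control flow, with a hypothetical probe function and errCh channel standing in for getServerStatus and s.done:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitUntilRunning polls probe() until it reports ready, while reacting
// immediately to process exit (errCh) or caller cancellation (ctx).
func waitUntilRunning(ctx context.Context, errCh <-chan error, probe func() bool) error {
	start := time.Now()
	expiresAt := start.Add(10 * time.Minute) // generous: large models load slowly
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case err := <-errCh:
			return fmt.Errorf("runner process has terminated: %w", err)
		default:
			// fall through and poll
		}
		if time.Now().After(expiresAt) {
			return errors.New("timed out waiting for runner to start")
		}
		if probe() {
			fmt.Printf("runner started in %0.2f seconds\n", time.Since(start).Seconds())
			return nil
		}
		time.Sleep(250 * time.Millisecond)
	}
}

func main() {
	readyAt := time.Now().Add(600 * time.Millisecond)
	err := waitUntilRunning(context.Background(), make(chan error, 1),
		func() bool { return time.Now().After(readyAt) })
	fmt.Println("err:", err)
}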
@@ -943,8 +946,11 @@ func (s *llmServer) Close() error {

 		if err := s.cmd.Process.Kill(); err != nil {
 			return err
 		}
-		_ = s.cmd.Wait()
+		// if ProcessState is already populated, Wait already completed, no need to wait again
+		if s.cmd.ProcessState == nil {
+			slog.Debug("waiting for llama server to exit")
+			<-s.done
+		}

 		slog.Debug("llama server stopped")
 	}
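Previously Close killed the process and then called s.cmd.Wait() itself; now that the reaper goroutine added in NewLlamaServer owns the single Wait call (calling Wait twice on an exec.Cmd returns an error), Close checks ProcessState and, if the reaper has not reported yet, blocks on the done channel until the exit status has been collected. A short self-contained sketch of that kill-then-drain shutdown, with a hypothetical closeRunner helper that is not part of this commit:

package main

import (
	"fmt"
	"os/exec"
)

// closeRunner mirrors the new Close logic: kill the child, then wait on the
// reaper's done channel only if Wait has not already completed.
func closeRunner(cmd *exec.Cmd, done <-chan error) error {
	if err := cmd.Process.Kill(); err != nil {
		return err
	}
	// ProcessState is populated once Wait returns; if it is still nil, the
	// reaper goroutine has not reported yet, so block until it does.
	if cmd.ProcessState == nil {
		<-done
	}
	return nil
}

func main() {
	cmd := exec.Command("sleep", "60") // stand-in for the runner
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }() // reaper, as added in NewLlamaServer

	fmt.Println("close:", closeRunner(cmd, done))
	fmt.Println("exit state:", cmd.ProcessState)
}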