orangecat / ollama · Commit 939c6047 (unverified)
Authored Feb 12, 2024 by Daniel Hiltgen; committed by GitHub, Feb 12, 2024

Merge pull request #2422 from dhiltgen/better_kill

More robust shutdown

Parents: f76ca04f, 66807615
Showing 2 changed files with 44 additions and 1 deletion (+44 −1)
llm/dyn_ext_server.go         +1 −1
llm/ext_server/ext_server.cpp +43 −0
llm/dyn_ext_server.go
@@ -258,7 +258,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 			})
 		}
 
-		if p.Stop {
+		if p.Stop || bool(result.stop) {
 			fn(PredictResult{
 				Done:            true,
 				PromptEvalCount: p.Timings.PromptN,
llm/ext_server/ext_server.cpp
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
...
...
@@ -27,8 +28,24 @@
 // Expose the llama server as a callable extern "C" API
 llama_server_context *llama = NULL;
 std::thread ext_server_thread;
+bool shutting_down = false;
+std::atomic_int recv_counter;
+
+// RAII wrapper for tracking in-flight recv calls
+class atomicRecv {
+ public:
+  atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
+    ++this->atomic;
+  }
+  ~atomicRecv() {
+    --this->atomic;
+  }
+ private:
+  std::atomic<int> &atomic;
+};
 
 void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
+  recv_counter = 0;
   assert(err != NULL && sparams != NULL);
   log_set_target(stderr);
   if (!sparams->verbose_logging) {
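The heart of the change is this pairing of a boolean gate with an atomic in-flight counter: every blocking recv below is bracketed by a stack-allocated atomicRecv, so the counter is decremented on every exit path, including exceptions. A minimal standalone sketch of the pattern (names such as ScopedCount, worker, and stop are illustrative, not from this commit):

#include <atomic>
#include <chrono>
#include <thread>

std::atomic_int in_flight{0};
bool stopping = false;

// RAII guard: increments on construction, decrements on destruction,
// so the count stays accurate even if the guarded call throws.
struct ScopedCount {
  explicit ScopedCount(std::atomic_int &c) : c(c) { ++c; }
  ~ScopedCount() { --c; }
  std::atomic_int &c;
};

void worker() {
  ScopedCount guard(in_flight); // counted for the lifetime of this scope
  // ... blocking receive / long-running work ...
}

void stop() {
  stopping = true;               // refuse new work at the entry points
  while (in_flight.load() > 0) { // drain whatever is already running
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }
}

This is the shape llama_server_stop takes in the next hunk.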
@@ -151,7 +168,14 @@ void llama_server_start() {
 
 void llama_server_stop() {
   assert(llama != NULL);
+  // Shutdown any in-flight requests and block incoming requests.
+  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
+  shutting_down = true;
+
+  while (recv_counter.load() > 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  }
   // This may take a while for any pending tasks to drain
   // TODO - consider a timeout to cancel tasks if it's taking too long
   llama->queue_tasks.terminate();
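The TODO above leaves the drain loop unbounded, so a task that never completes would hang shutdown. One possible shape for the timeout it suggests — a sketch only, since drain_with_timeout is a hypothetical helper and not part of this commit:

#include <atomic>
#include <chrono>
#include <thread>

// Hypothetical helper: wait for in-flight calls to drain, but give up
// after `limit` so shutdown cannot block forever. Returns true if drained.
static bool drain_with_timeout(std::atomic_int &counter,
                               std::chrono::milliseconds limit) {
  const auto deadline = std::chrono::steady_clock::now() + limit;
  while (counter.load() > 0) {
    if (std::chrono::steady_clock::now() >= deadline) {
      return false; // caller could then cancel the remaining tasks
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }
  return true;
}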
@@ -166,6 +190,9 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
   resp->id = -1;
   resp->msg[0] = '\0';
   try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
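This early throw inside the existing try block reuses the normal error path, so callers see an ordinary error response carrying the message "server shutting down". The same three-line guard is repeated in llama_server_tokenize, llama_server_detokenize, and llama_server_embedding below; factored out, it might look like this (a sketch — the commit keeps the check inline):

#include <stdexcept>

// Hypothetical refactor of the repeated guard; not part of this commit.
static void reject_if_shutting_down() {
  if (shutting_down) {
    throw std::runtime_error("server shutting down");
  }
}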
@@ -187,6 +214,7 @@ void llama_server_completion_next_result(const int task_id,
   resp->json_resp = NULL;
   std::string result_json;
   try {
+    atomicRecv ar(recv_counter);
     task_result result = llama->queue_results.recv(task_id);
     result_json =
         result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
@@ -203,6 +231,11 @@ void llama_server_completion_next_result(const int task_id,
       llama->request_cancel(task_id);
       LOG_TEE("next result removing waiting task ID: %d\n", task_id);
       llama->queue_results.remove_waiting_task_id(task_id);
+    } else if (shutting_down) {
+      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
+      llama->request_cancel(task_id);
+      llama->queue_results.remove_waiting_task_id(task_id);
+      resp->stop = true;
     }
   } catch (std::exception &e) {
     resp->error = true;
@@ -251,6 +284,9 @@ void llama_server_tokenize(const char *json_req, char **json_resp,
   err->id = 0;
   err->msg[0] = '\0';
   try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
     const json body = json::parse(json_req);
     std::vector<llama_token> tokens;
     if (body.count("content") != 0) {
@@ -284,6 +320,9 @@ void llama_server_detokenize(const char *json_req, char **json_resp,
   err->id = 0;
   err->msg[0] = '\0';
   try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
     const json body = json::parse(json_req);
     std::string content;
     if (body.count("tokens") != 0) {
@@ -311,6 +350,9 @@ void llama_server_embedding(const char *json_req, char **json_resp,
   err->id = 0;
   err->msg[0] = '\0';
   try {
+    if (shutting_down) {
+      throw std::runtime_error("server shutting down");
+    }
     const json body = json::parse(json_req);
     json prompt;
     if (body.count("content") != 0) {
@@ -321,6 +363,7 @@ void llama_server_embedding(const char *json_req, char **json_resp,
     const int task_id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(task_id);
     llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    atomicRecv ar(recv_counter);
     task_result result = llama->queue_results.recv(task_id);
     std::string result_json = result.result_json.dump();
     const std::string::size_type size = result_json.size() + 1;