Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
text-generation-inference
Commits
0f2daad8
Unverified
Commit
0f2daad8
authored
Feb 16, 2024
by
OlivierDehaene
Committed by
GitHub
Feb 16, 2024
Browse files
feat: add chat template struct to avoid tuple ordering errors (#1570)
parent
9946165e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
33 deletions
+46
-33
router/src/infer.rs
router/src/infer.rs
+46
-33
No files found.
router/src/infer.rs
View file @
0f2daad8
...
@@ -31,14 +31,10 @@ pub struct Infer {
...
@@ -31,14 +31,10 @@ pub struct Infer {
queue
:
Queue
,
queue
:
Queue
,
/// Shared state
/// Shared state
shared
:
Arc
<
Shared
>
,
shared
:
Arc
<
Shared
>
,
/// Chat template
chat_template
:
Option
<
ChatTemplate
>
,
/// Inference limit
/// Inference limit
limit_concurrent_requests
:
Arc
<
Semaphore
>
,
limit_concurrent_requests
:
Arc
<
Semaphore
>
,
/// Chat template (template, bos_token, eos_token)
template
:
(
Option
<
Template
<
'static
,
'static
>>
,
Option
<
String
>
,
Option
<
String
>
,
),
}
}
/// Infer shared state
/// Infer shared state
...
@@ -88,32 +84,19 @@ impl Infer {
...
@@ -88,32 +84,19 @@ impl Infer {
generation_health
,
generation_health
,
));
));
let
chat_template
=
tokenizer_config
.chat_template
.map
(|
t
|
ChatTemplate
::
new
(
t
,
tokenizer_config
.bos_token
,
tokenizer_config
.eos_token
));
// Inference limit with a semaphore
// Inference limit with a semaphore
let
semaphore
=
Arc
::
new
(
Semaphore
::
new
(
max_concurrent_requests
));
let
semaphore
=
Arc
::
new
(
Semaphore
::
new
(
max_concurrent_requests
));
let
template
=
tokenizer_config
.chat_template
.map
(|
t
|
{
let
mut
env
=
Box
::
new
(
Environment
::
new
());
let
template_str
=
t
.into_boxed_str
();
env
.add_function
(
"raise_exception"
,
raise_exception
);
// leaking env and template_str as read-only, static resources for performance.
Box
::
leak
(
env
)
.template_from_str
(
Box
::
leak
(
template_str
))
.unwrap
()
});
let
eos_token
=
tokenizer_config
.eos_token
.map_or_else
(
String
::
new
,
|
t
|
t
)
.into
();
let
bos_token
=
tokenizer_config
.bos_token
.map_or_else
(
String
::
new
,
|
t
|
t
)
.into
();
Self
{
Self
{
validation
,
validation
,
queue
,
queue
,
shared
,
shared
,
chat_template
,
limit_concurrent_requests
:
semaphore
,
limit_concurrent_requests
:
semaphore
,
template
:
(
template
,
bos_token
,
eos_token
),
}
}
}
}
...
@@ -192,20 +175,14 @@ impl Infer {
...
@@ -192,20 +175,14 @@ impl Infer {
/// Apply the chat template to the chat request
/// Apply the chat template to the chat request
#[instrument(skip_all)]
#[instrument(skip_all)]
pub
(
crate
)
fn
apply_chat_template
(
&
self
,
messages
:
Vec
<
Message
>
)
->
Result
<
String
,
InferError
>
{
pub
(
crate
)
fn
apply_chat_template
(
&
self
,
messages
:
Vec
<
Message
>
)
->
Result
<
String
,
InferError
>
{
let
(
template
,
bos_token
,
eos_token
)
=
&
self
.template
;
self
.chat_template
template
.as_ref
()
.as_ref
()
.ok_or_else
(||
InferError
::
TemplateError
(
ErrorKind
::
TemplateNotFound
.into
()))
?
.ok_or_else
(||
InferError
::
TemplateError
(
ErrorKind
::
TemplateNotFound
.into
()))
?
.render
(
ChatTemplateInputs
{
.apply
(
messages
)
messages
,
eos_token
:
eos_token
.as_deref
(),
bos_token
:
bos_token
.as_deref
(),
add_generation_prompt
:
true
,
})
.map_err
(|
e
|
{
.map_err
(|
e
|
{
metrics
::
increment_counter!
(
"tgi_request_failure"
,
"err"
=>
"template"
);
metrics
::
increment_counter!
(
"tgi_request_failure"
,
"err"
=>
"template"
);
tracing
::
error!
(
"{e}"
);
tracing
::
error!
(
"{e}"
);
InferError
::
TemplateError
(
e
)
e
})
})
}
}
...
@@ -329,6 +306,42 @@ impl Infer {
...
@@ -329,6 +306,42 @@ impl Infer {
}
}
}
}
/// Bundles a compiled chat template together with its special tokens,
/// replacing the previous `(Option<Template>, Option<String>, Option<String>)`
/// tuple whose positional ordering was error-prone.
#[derive(Clone)]
struct ChatTemplate {
    // Compiled minijinja template; 'static because the environment and
    // template source are intentionally leaked at construction time.
    template: Template<'static, 'static>,
    // Beginning-of-sequence token, if the tokenizer config defines one.
    bos_token: Option<String>,
    // End-of-sequence token, if the tokenizer config defines one.
    eos_token: Option<String>,
}
impl ChatTemplate {
    /// Compile `template` once and keep it alive for the process lifetime.
    ///
    /// The minijinja `Environment` and the template source are deliberately
    /// leaked so the compiled `Template<'static, 'static>` can be stored in a
    /// long-lived struct without self-referential lifetimes.
    ///
    /// # Panics
    /// Panics if the template source is not valid minijinja syntax. Uses
    /// `expect` (was a bare `unwrap`) so an invalid template in the tokenizer
    /// config fails at startup with an actionable message.
    fn new(template: String, bos_token: Option<String>, eos_token: Option<String>) -> Self {
        let mut env = Box::new(Environment::new());
        let template_str = template.into_boxed_str();
        // Expose `raise_exception` so HF-style templates can signal errors.
        env.add_function("raise_exception", raise_exception);
        // leaking env and template_str as read-only, static resources for performance.
        let template = Box::leak(env)
            .template_from_str(Box::leak(template_str))
            .expect("invalid chat template in tokenizer config");

        Self {
            template,
            bos_token,
            eos_token,
        }
    }

    /// Render the template over `messages`, injecting the special tokens.
    ///
    /// # Errors
    /// Returns [`InferError::TemplateError`] when minijinja rendering fails
    /// (e.g. the template calls `raise_exception`).
    fn apply(&self, messages: Vec<Message>) -> Result<String, InferError> {
        self.template
            .render(ChatTemplateInputs {
                messages,
                bos_token: self.bos_token.as_deref(),
                eos_token: self.eos_token.as_deref(),
                // Always append the generation prompt for inference requests.
                add_generation_prompt: true,
            })
            .map_err(InferError::TemplateError)
    }
}
/// Batching logic
/// Batching logic
/// Will be launched in a background Tokio task
/// Will be launched in a background Tokio task
///
///
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment