OpenDAS / dynamo · Commit 110f3f8c

refactor: rename ChatCompletionResponseDelta to NvCreateChatCompletionStreamResponse (#292)

Authored Feb 27, 2025 by Paul Hendricks. Committed by GitHub on Feb 27, 2025.
Parent: c13ea718

Showing 14 changed files with 59 additions and 52 deletions.
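In effect, the entire commit is one struct rename propagated through imports, trait bounds, type aliases, and constructors. A before/after sketch, abridged from the chat_completions.rs hunk below (derives elided):

    // Before
    pub struct ChatCompletionResponseDelta {
        #[serde(flatten)]
        pub inner: async_openai::types::CreateChatCompletionStreamResponse,
    }

    // After: the wrapper's name now mirrors the async_openai type it wraps,
    // consistent with NvCreateChatCompletionRequest and NvCreateChatCompletionResponse.
    pub struct NvCreateChatCompletionStreamResponse {
        #[serde(flatten)]
        pub inner: async_openai::types::CreateChatCompletionStreamResponse,
    }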
launch/tio/src/input/endpoint.rs                              +4 -2
launch/tio/src/input/http.rs                                  +4 -2
launch/tio/src/input/text.rs                                  +2 -2
launch/tio/src/lib.rs                                         +3 -3
launch/tio/src/output/echo_full.rs                            +5 -5
lib/llm/src/engines/mistralrs.rs                              +4 -4
lib/llm/src/http/service/discovery.rs                         +2 -2
lib/llm/src/preprocessor.rs                                   +5 -5
lib/llm/src/protocols/codec.rs                                +2 -2
lib/llm/src/protocols/openai/chat_completions.rs              +1 -1
lib/llm/src/protocols/openai/chat_completions/aggregator.rs   +11 -10
lib/llm/src/protocols/openai/chat_completions/delta.rs        +6 -4
lib/llm/src/types.rs                                          +3 -3
lib/llm/tests/http-service.rs                                 +7 -7
launch/tio/src/input/endpoint.rs

@@ -19,7 +19,9 @@ use triton_distributed_llm::{
     model_type::ModelType,
     preprocessor::OpenAIPreprocessor,
     types::{
-        openai::chat_completions::{ChatCompletionResponseDelta, NvCreateChatCompletionRequest},
+        openai::chat_completions::{
+            NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
+        },
         Annotated,
     },
 };

@@ -55,7 +57,7 @@ pub async fn run(
         } => {
             let frontend = SegmentSource::<
                 SingleIn<NvCreateChatCompletionRequest>,
-                ManyOut<Annotated<ChatCompletionResponseDelta>>,
+                ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
             >::new();
             let preprocessor = OpenAIPreprocessor::new(*card.clone()).await?
launch/tio/src/input/http.rs

@@ -21,7 +21,9 @@ use triton_distributed_llm::{
     model_type::ModelType,
     preprocessor::OpenAIPreprocessor,
     types::{
-        openai::chat_completions::{ChatCompletionResponseDelta, NvCreateChatCompletionRequest},
+        openai::chat_completions::{
+            NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
+        },
         Annotated,
     },
 };

@@ -75,7 +77,7 @@ pub async fn run(
         } => {
             let frontend = ServiceFrontend::<
                 SingleIn<NvCreateChatCompletionRequest>,
-                ManyOut<Annotated<ChatCompletionResponseDelta>>,
+                ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
             >::new();
             let preprocessor = OpenAIPreprocessor::new(*card.clone()).await?
launch/tio/src/input/text.rs

@@ -23,7 +23,7 @@ use triton_distributed_llm::{
     preprocessor::OpenAIPreprocessor,
     types::{
         openai::chat_completions::{
-            ChatCompletionResponseDelta, NvCreateChatCompletionRequest,
+            NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
             OpenAIChatCompletionsStreamingEngine,
         },
         Annotated,

@@ -72,7 +72,7 @@ pub async fn run(
         } => {
             let frontend = ServiceFrontend::<
                 SingleIn<NvCreateChatCompletionRequest>,
-                ManyOut<Annotated<ChatCompletionResponseDelta>>,
+                ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
             >::new();
             let preprocessor = OpenAIPreprocessor::new(*card.clone()).await?
launch/tio/src/lib.rs

@@ -21,7 +21,7 @@ use triton_distributed_llm::{
     model_card::model::ModelDeploymentCard,
     types::{
         openai::chat_completions::{
-            ChatCompletionResponseDelta, NvCreateChatCompletionRequest,
+            NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
             OpenAIChatCompletionsStreamingEngine,
         },
         Annotated,

@@ -113,7 +113,7 @@ pub struct Flags {
 pub enum EngineConfig {
     /// An remote networked engine we don't know about yet
     /// We don't have the pre-processor yet so this is only text requests. Type will change later.
-    Dynamic(Client<NvCreateChatCompletionRequest, Annotated<ChatCompletionResponseDelta>>),
+    Dynamic(Client<NvCreateChatCompletionRequest, Annotated<NvCreateChatCompletionStreamResponse>>),
     /// A Full service engine does it's own tokenization and prompt formatting.
     StaticFull {

@@ -223,7 +223,7 @@ pub async fn run(
         .namespace(endpoint.namespace)?
         .component(endpoint.component)?
         .endpoint(endpoint.name)
-        .client::<NvCreateChatCompletionRequest, Annotated<ChatCompletionResponseDelta>>()
+        .client::<NvCreateChatCompletionRequest, Annotated<NvCreateChatCompletionStreamResponse>>()
         .await?;
     tracing::info!("Waiting for remote {}...", client.path());
launch/tio/src/output/echo_full.rs

@@ -19,7 +19,7 @@ use async_stream::stream;
 use async_trait::async_trait;
 use triton_distributed_llm::protocols::openai::chat_completions::{
-    ChatCompletionResponseDelta, NvCreateChatCompletionRequest,
+    NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
 };
 use triton_distributed_llm::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;
 use triton_distributed_runtime::engine::{AsyncEngine, AsyncEngineContextProvider, ResponseStream};

@@ -41,14 +41,14 @@ pub fn make_engine_full() -> OpenAIChatCompletionsStreamingEngine {
 impl
     AsyncEngine<
         SingleIn<NvCreateChatCompletionRequest>,
-        ManyOut<Annotated<ChatCompletionResponseDelta>>,
+        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
     > for EchoEngineFull
 {
     async fn generate(
         &self,
         incoming_request: SingleIn<NvCreateChatCompletionRequest>,
-    ) -> Result<ManyOut<Annotated<ChatCompletionResponseDelta>>, Error> {
+    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
         let deltas = request.response_generator();
         let ctx = context.context();

@@ -72,7 +72,7 @@ impl
             // we are returning characters not tokens, so speed up some
             tokio::time::sleep(TOKEN_ECHO_DELAY / 2).await;
             let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
-            let response = ChatCompletionResponseDelta {
+            let response = NvCreateChatCompletionStreamResponse {
                 inner,
             };
             yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };

@@ -80,7 +80,7 @@ impl
         }
         let inner = deltas.create_choice(0, None, Some(async_openai::types::FinishReason::Stop), None);
-        let response = ChatCompletionResponseDelta {
+        let response = NvCreateChatCompletionStreamResponse {
             inner,
         };
         yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
lib/llm/src/engines/mistralrs.rs

@@ -34,7 +34,7 @@ use triton_distributed_runtime::pipeline::{Error, ManyOut, SingleIn};
 use triton_distributed_runtime::protocols::annotated::Annotated;
 use crate::protocols::openai::chat_completions::{
-    ChatCompletionRequest, ChatCompletionResponseDelta,
+    ChatCompletionRequest, NvCreateChatCompletionStreamResponse,
 };
 use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine;

@@ -161,14 +161,14 @@ impl MistralRsEngine {
 impl
     AsyncEngine<
         SingleIn<ChatCompletionRequest>,
-        ManyOut<Annotated<ChatCompletionResponseDelta>>,
+        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
     > for MistralRsEngine
 {
     async fn generate(
         &self,
         request: SingleIn<ChatCompletionRequest>,
-    ) -> Result<ManyOut<Annotated<ChatCompletionResponseDelta>>, Error> {
+    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = request.transfer(());
         let ctx = context.context();
         let (tx, mut rx) = channel(10_000);

@@ -286,7 +286,7 @@ impl
                 system_fingerprint: Some(c.system_fingerprint),
                 service_tier: None,
             };
-            let delta = ChatCompletionResponseDelta { inner };
+            let delta = NvCreateChatCompletionStreamResponse { inner };
             let ann = Annotated { id: None, data: Some(delta),
lib/llm/src/http/service/discovery.rs

@@ -28,7 +28,7 @@ use triton_distributed_runtime::{
 use super::ModelManager;
 use crate::model_type::ModelType;
 use crate::protocols::openai::chat_completions::{
-    ChatCompletionResponseDelta, NvCreateChatCompletionRequest,
+    NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
 };
 use crate::protocols::openai::completions::{CompletionRequest, CompletionResponse};
 use tracing;

@@ -135,7 +135,7 @@ async fn handle_put(kv: &KeyValue, state: Arc<ModelWatchState>) -> Result<(&str,
         .namespace(model_entry.endpoint.namespace)?
         .component(model_entry.endpoint.component)?
         .endpoint(model_entry.endpoint.name)
-        .client::<NvCreateChatCompletionRequest, Annotated<ChatCompletionResponseDelta>>()
+        .client::<NvCreateChatCompletionRequest, Annotated<NvCreateChatCompletionStreamResponse>>()
         .await?;
     state.manager
lib/llm/src/preprocessor.rs

@@ -44,7 +44,7 @@ use triton_distributed_runtime::protocols::annotated::{Annotated, AnnotationsPro
 use crate::protocols::{
     common::{SamplingOptionsProvider, StopConditionsProvider},
     openai::{
-        chat_completions::{ChatCompletionResponseDelta, NvCreateChatCompletionRequest},
+        chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
         completions::{CompletionRequest, CompletionResponse},
         nvext::NvExtProvider,
         DeltaGeneratorExt,

@@ -225,7 +225,7 @@ impl OpenAIPreprocessor {
             tracing::trace!(
                 request_id = inner.context.id(),
-                "OpenAI ChatCompletionResponseDelta: {:?}",
+                "OpenAI NvCreateChatCompletionStreamResponse: {:?}",
                 response
             );

@@ -252,7 +252,7 @@ impl OpenAIPreprocessor {
 impl
     Operator<
         SingleIn<NvCreateChatCompletionRequest>,
-        ManyOut<Annotated<ChatCompletionResponseDelta>>,
+        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         SingleIn<BackendInput>,
         ManyOut<Annotated<BackendOutput>>,
     > for OpenAIPreprocessor

@@ -263,7 +263,7 @@ impl
         next: Arc<
             dyn AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>, Error>,
         >,
-    ) -> Result<ManyOut<Annotated<ChatCompletionResponseDelta>>, Error> {
+    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         // unpack the request
         let (request, context) = request.into_parts();

@@ -281,7 +281,7 @@ impl
         let common_request = context.map(|_| common_request);

         // create a stream of annotations this will be prepend to the response stream
-        let annotations: Vec<Annotated<ChatCompletionResponseDelta>> = annotations
+        let annotations: Vec<Annotated<NvCreateChatCompletionStreamResponse>> = annotations
             .into_iter()
             .flat_map(|(k, v)| Annotated::from_annotation(k, &v))
             .collect();
lib/llm/src/protocols/codec.rs

@@ -640,7 +640,7 @@ data: [DONE]
     #[tokio::test]
     async fn test_openai_chat_stream() {
-        use crate::protocols::openai::chat_completions::ChatCompletionResponseDelta;
+        use crate::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;

         // let cursor = Cursor::new(SAMPLE_CHAT_DATA);
         // let mut framed = FramedRead::new(cursor, SseLineCodec::new());

@@ -652,7 +652,7 @@ data: [DONE]
         loop {
             match stream.next().await {
                 Some(Ok(message)) => {
-                    let delta: ChatCompletionResponseDelta =
+                    let delta: NvCreateChatCompletionStreamResponse =
                         serde_json::from_str(&message.data.unwrap()).unwrap();
                     counter += 1;
                     println!("counter: {}", counter);
lib/llm/src/protocols/openai/chat_completions.rs

@@ -47,7 +47,7 @@ pub struct ChatCompletionContent {
 }

 #[derive(Serialize, Deserialize, Validate, Debug, Clone)]
-pub struct ChatCompletionResponseDelta {
+pub struct NvCreateChatCompletionStreamResponse {
     #[serde(flatten)]
     pub inner: async_openai::types::CreateChatCompletionStreamResponse,
 }
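Because of `#[serde(flatten)]`, the renamed wrapper still serializes to exactly the JSON of the inner async_openai type, so the rename changes nothing on the wire. A minimal, self-contained sketch of that behavior, using a simplified stand-in for the inner type (the real one is async_openai::types::CreateChatCompletionStreamResponse):

    use serde::{Deserialize, Serialize};

    // Stand-in for the async_openai inner type; the field set here is illustrative only.
    #[derive(Serialize, Deserialize, Debug)]
    struct InnerStreamResponse {
        id: String,
        model: String,
    }

    // Same shape as the struct in this commit, minus the Validate derive.
    #[derive(Serialize, Deserialize, Debug)]
    struct NvCreateChatCompletionStreamResponse {
        #[serde(flatten)]
        inner: InnerStreamResponse,
    }

    fn main() {
        let wrapped = NvCreateChatCompletionStreamResponse {
            inner: InnerStreamResponse { id: "cmpl-1".into(), model: "demo".into() },
        };
        // Flattening hoists the inner fields to the top level, so no "inner" key
        // appears in the output: {"id":"cmpl-1","model":"demo"}
        println!("{}", serde_json::to_string(&wrapped).unwrap());
    }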
lib/llm/src/protocols/openai/chat_completions/aggregator.rs

@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use super::{ChatCompletionResponseDelta, NvCreateChatCompletionResponse};
+use super::{NvCreateChatCompletionResponse, NvCreateChatCompletionStreamResponse};
 use crate::protocols::{
     codec::{Message, SseCodecError},
     convert_sse_stream, Annotated,

@@ -24,7 +24,7 @@ use std::{collections::HashMap, pin::Pin};
 type DataStream<T> = Pin<Box<dyn Stream<Item = T> + Send + Sync>>;

-/// Aggregates a stream of [`ChatCompletionResponseDelta`]s into a single [`ChatCompletionResponse`].
+/// Aggregates a stream of [`NvCreateChatCompletionStreamResponse`]s into a single [`NvCreateChatCompletionResponse`].
 pub struct DeltaAggregator {
     id: String,
     model: String,

@@ -66,9 +66,9 @@ impl DeltaAggregator {
     }
 }

-    /// Aggregates a stream of [`ChatCompletionResponseDelta`]s into a single [`ChatCompletionResponse`].
+    /// Aggregates a stream of [`NvCreateChatCompletionStreamResponse`]s into a single [`NvCreateChatCompletionResponse`].
     pub async fn apply(
-        stream: DataStream<Annotated<ChatCompletionResponseDelta>>,
+        stream: DataStream<Annotated<NvCreateChatCompletionStreamResponse>>,
     ) -> Result<NvCreateChatCompletionResponse, String> {
         let aggregator = stream.fold(DeltaAggregator::new(), |mut aggregator, delta| async move {

@@ -184,12 +184,12 @@ impl NvCreateChatCompletionResponse {
     pub async fn from_sse_stream(
         stream: DataStream<Result<Message, SseCodecError>>,
     ) -> Result<NvCreateChatCompletionResponse, String> {
-        let stream = convert_sse_stream::<ChatCompletionResponseDelta>(stream);
+        let stream = convert_sse_stream::<NvCreateChatCompletionStreamResponse>(stream);
         NvCreateChatCompletionResponse::from_annotated_stream(stream).await
     }

     pub async fn from_annotated_stream(
-        stream: DataStream<Annotated<ChatCompletionResponseDelta>>,
+        stream: DataStream<Annotated<NvCreateChatCompletionStreamResponse>>,
     ) -> Result<NvCreateChatCompletionResponse, String> {
         DeltaAggregator::apply(stream).await
     }
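The two entry points in the hunk above are thin wrappers over the same aggregation: from_sse_stream first decodes raw SSE messages into annotated deltas, while from_annotated_stream hands them straight to DeltaAggregator::apply. A hedged usage sketch against the signatures shown here (building the stream with futures::stream::iter is an assumption; any Send + Sync stream of annotated deltas works):

    use futures::stream;

    async fn collect_full_response(
        deltas: Vec<Annotated<NvCreateChatCompletionStreamResponse>>,
    ) -> Result<NvCreateChatCompletionResponse, String> {
        // DataStream<T> is Pin<Box<dyn Stream<Item = T> + Send + Sync>>,
        // so a boxed in-memory iterator stream satisfies it.
        let stream: DataStream<Annotated<NvCreateChatCompletionStreamResponse>> =
            Box::pin(stream::iter(deltas));
        // Folds every streamed delta into one aggregated response.
        NvCreateChatCompletionResponse::from_annotated_stream(stream).await
    }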
@@ -207,7 +207,7 @@ mod tests {
         text: &str,
         role: Option<async_openai::types::Role>,
         finish_reason: Option<async_openai::types::FinishReason>,
-    ) -> Annotated<ChatCompletionResponseDelta> {
+    ) -> Annotated<NvCreateChatCompletionStreamResponse> {
         // ALLOW: function_call is deprecated
         let delta = async_openai::types::ChatCompletionStreamResponseDelta {
             content: Some(text.to_string()),

@@ -234,7 +234,7 @@ mod tests {
             object: "chat.completion".to_string(),
         };

-        let data = ChatCompletionResponseDelta { inner };
+        let data = NvCreateChatCompletionStreamResponse { inner };

         Annotated {
             data: Some(data),

@@ -247,7 +247,8 @@ mod tests {
     #[tokio::test]
     async fn test_empty_stream() {
         // Create an empty stream
-        let stream: DataStream<Annotated<ChatCompletionResponseDelta>> = Box::pin(stream::empty());
+        let stream: DataStream<Annotated<NvCreateChatCompletionStreamResponse>> =
+            Box::pin(stream::empty());

         // Call DeltaAggregator::apply
         let result = DeltaAggregator::apply(stream).await;

@@ -375,7 +376,7 @@ mod tests {
             object: "chat.completion".to_string(),
         };

-        let data = ChatCompletionResponseDelta { inner: delta };
+        let data = NvCreateChatCompletionStreamResponse { inner: delta };

         // Wrap it in Annotated and create a stream
         let annotated_delta = Annotated {
lib/llm/src/protocols/openai/chat_completions/delta.rs

@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use super::{ChatCompletionResponseDelta, NvCreateChatCompletionRequest};
+use super::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse};
 use crate::protocols::common;

 impl NvCreateChatCompletionRequest {

@@ -135,11 +135,13 @@ impl DeltaGenerator {
     }
 }

-impl crate::protocols::openai::DeltaGeneratorExt<ChatCompletionResponseDelta> for DeltaGenerator {
+impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamResponse>
+    for DeltaGenerator
+{
     fn choice_from_postprocessor(
         &mut self,
         delta: crate::protocols::common::llm_backend::BackendOutput,
-    ) -> anyhow::Result<ChatCompletionResponseDelta> {
+    ) -> anyhow::Result<NvCreateChatCompletionStreamResponse> {
         // aggregate usage
         if self.options.enable_usage {
             self.usage.completion_tokens += delta.token_ids.len() as u32;

@@ -163,7 +165,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<ChatCompletionResponseDelta> fo
         let index = 0;
         let stream_response = self.create_choice(index, delta.text, finish_reason, logprobs);

-        Ok(ChatCompletionResponseDelta {
+        Ok(NvCreateChatCompletionStreamResponse {
             inner: stream_response,
         })
     }
lib/llm/src/types.rs

@@ -38,8 +38,8 @@ pub mod openai {
     use super::*;

     pub use protocols::openai::chat_completions::{
-        ChatCompletionResponseDelta, NvCreateChatCompletionRequest,
-        NvCreateChatCompletionResponse,
+        NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
+        NvCreateChatCompletionStreamResponse,
     };

     /// A [`UnaryEngine`] implementation for the OpenAI Chat Completions API

@@ -49,7 +49,7 @@ pub mod openai {
     /// A [`ServerStreamingEngine`] implementation for the OpenAI Chat Completions API
     pub type OpenAIChatCompletionsStreamingEngine = ServerStreamingEngine<
         NvCreateChatCompletionRequest,
-        Annotated<ChatCompletionResponseDelta>,
+        Annotated<NvCreateChatCompletionStreamResponse>,
     >;
 }
lib/llm/tests/http-service.rs

@@ -26,7 +26,7 @@ use triton_distributed_llm::http::service::{
 };
 use triton_distributed_llm::protocols::{
     openai::{
-        chat_completions::{ChatCompletionResponseDelta, NvCreateChatCompletionRequest},
+        chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
         completions::{CompletionRequest, CompletionResponse},
     },
     Annotated,

@@ -45,21 +45,21 @@ struct CounterEngine {}
 impl
     AsyncEngine<
         SingleIn<NvCreateChatCompletionRequest>,
-        ManyOut<Annotated<ChatCompletionResponseDelta>>,
+        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
     > for CounterEngine
 {
     async fn generate(
         &self,
         request: SingleIn<NvCreateChatCompletionRequest>,
-    ) -> Result<ManyOut<Annotated<ChatCompletionResponseDelta>>, Error> {
+    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = request.transfer(());
         let ctx = context.context();

         // ALLOW: max_tokens is deprecated in favor of completion_usage_tokens
         let max_tokens = request.inner.max_tokens.unwrap_or(0) as u64;

-        // let generator = ChatCompletionResponseDelta::generator(request.model.clone());
+        // let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone());
         let generator = request.response_generator();

         let stream = stream! {

@@ -67,7 +67,7 @@ impl
             for i in 0..10 {
                 let inner = generator.create_choice(i, Some(format!("choice {i}")), None, None);
-                let output = ChatCompletionResponseDelta {
+                let output = NvCreateChatCompletionStreamResponse {
                     inner,
                 };

@@ -85,14 +85,14 @@ struct AlwaysFailEngine {}
 impl
     AsyncEngine<
         SingleIn<NvCreateChatCompletionRequest>,
-        ManyOut<Annotated<ChatCompletionResponseDelta>>,
+        ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
         Error,
     > for AlwaysFailEngine
 {
     async fn generate(
         &self,
         _request: SingleIn<NvCreateChatCompletionRequest>,
-    ) -> Result<ManyOut<Annotated<ChatCompletionResponseDelta>>, Error> {
+    ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         Err(HttpError {
             code: 403,
             message: "Always fail".to_string(),