Unverified commit 3f6a7472, authored by Graham King, committed by GitHub
Browse files

chore: Remove PreprocessedRequest alias BackendInput (#1307)

It was confusing to have two names for one type.

This tidy-up started in #1064 and is now complete.
parent 859944f4
......@@ -9,7 +9,7 @@ use dynamo_llm::{
engines::StreamingEngineAdapter,
model_card::ModelDeploymentCard,
preprocessor::OpenAIPreprocessor,
protocols::common::llm_backend::{BackendInput, BackendOutput},
protocols::common::llm_backend::{BackendOutput, PreprocessedRequest},
types::{
openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......@@ -113,7 +113,7 @@ where
OpenAIPreprocessor: Operator<
Context<Req>,
Pin<Box<dyn AsyncEngineStream<Annotated<Resp>>>>,
Context<BackendInput>,
Context<PreprocessedRequest>,
Pin<Box<dyn AsyncEngineStream<Annotated<BackendOutput>>>>,
>,
{
......
......@@ -19,7 +19,7 @@ use dynamo_llm::{
backend::Backend,
engines::StreamingEngineAdapter,
model_type::ModelType,
preprocessor::{BackendInput, BackendOutput},
preprocessor::{BackendOutput, PreprocessedRequest},
types::{
openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......@@ -71,8 +71,10 @@ pub async fn run(
mut model,
} => {
// Pre-processing is done ingress-side, so it should be already done.
let frontend =
SegmentSource::<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>>::new();
let frontend = SegmentSource::<
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
>::new();
let backend = Backend::from_mdc(model.card().clone())
.await?
.into_operator();
......
......@@ -16,7 +16,7 @@
use super::*;
use crate::llm::model_card::ModelDeploymentCard;
use llm_rs::protocols::common::llm_backend::{BackendInput, BackendOutput};
use llm_rs::protocols::common::llm_backend::{BackendOutput, PreprocessedRequest};
use llm_rs::types::Annotated;
use dynamo_runtime::pipeline::{Operator, ServiceBackend, ServiceFrontend, Source};
......@@ -44,8 +44,10 @@ impl Backend {
}
fn start<'p>(&self, py: Python<'p>, generator: PyObject) -> PyResult<Bound<'p, PyAny>> {
let frontend =
ServiceFrontend::<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>>::new();
let frontend = ServiceFrontend::<
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
>::new();
let backend = self.inner.into_operator();
let engine = Arc::new(PythonAsyncEngine::new(
......
......@@ -18,7 +18,7 @@ use crate::llm::model_card::ModelDeploymentCard;
use llm_rs::{
preprocessor::OpenAIPreprocessor,
protocols::common::llm_backend::{BackendInput, BackendOutput},
protocols::common::llm_backend::{BackendOutput, PreprocessedRequest},
types::{
openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......@@ -60,7 +60,7 @@ impl OAIChatPreprocessor {
>::new();
let network =
SegmentSink::<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>>::new();
SegmentSink::<SingleIn<PreprocessedRequest>, ManyOut<Annotated<BackendOutput>>>::new();
let preprocessor = self.inner.into_operator();
let pipeline = frontend
......@@ -77,7 +77,7 @@ impl OAIChatPreprocessor {
let endpoint = Arc::new(self.next.inner.clone());
pyo3_async_runtimes::tokio::future_into_py(py, async move {
let client = endpoint.client().await.map_err(to_pyerr)?;
let router = PushRouter::<BackendInput, Annotated<BackendOutput>>::from_client(
let router = PushRouter::<PreprocessedRequest, Annotated<BackendOutput>>::from_client(
client,
Default::default(),
)
......
......@@ -23,7 +23,7 @@ use llama_cpp_2::{
LogOptions,
};
use dynamo_llm::protocols::common::llm_backend::{BackendInput, LLMEngineOutput};
use dynamo_llm::protocols::common::llm_backend::LLMEngineOutput;
use dynamo_llm::protocols::common::preprocessor::PreprocessedRequest;
use dynamo_llm::{backend::ExecutionContext, local_model::LocalModel};
......@@ -119,12 +119,12 @@ fn load_model(backend: &LlamaBackend, model_path: &Path) -> Result<LlamaModel> {
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for LlamacppEngine
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
request: SingleIn<PreprocessedRequest>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
let (request, context) = request.into_parts();
let ctx = context.context();
......
......@@ -44,7 +44,7 @@ use dynamo_runtime::{
use crate::protocols::{
common::{
llm_backend::{BackendInput, BackendOutput, FinishReason, LLMEngineOutput},
llm_backend::{BackendOutput, FinishReason, LLMEngineOutput, PreprocessedRequest},
StopConditions,
},
TokenIdType,
......@@ -56,7 +56,7 @@ use tokenizers::Tokenizer as HfTokenizer;
pub type ExecutionOutputStream = Annotated<LLMEngineOutput>;
/// Context for executing LLM inference, engine consumes backend input and produces execution output stream
pub type ExecutionContext = ServerStreamingEngine<BackendInput, ExecutionOutputStream>;
pub type ExecutionContext = ServerStreamingEngine<PreprocessedRequest, ExecutionOutputStream>;
/// Backend handles resource management and orchestrates LLM execution
#[allow(dead_code)]
......@@ -121,16 +121,16 @@ impl Backend {
#[async_trait]
impl
Operator<
SingleIn<BackendInput>,
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
SingleIn<BackendInput>,
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<LLMEngineOutput>>,
> for Backend
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
next: ServerStreamingEngine<BackendInput, Annotated<LLMEngineOutput>>,
request: SingleIn<PreprocessedRequest>,
next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
) -> Result<ManyOut<Annotated<BackendOutput>>> {
let stop_conditions = request.stop_conditions.clone();
let next_stream = next.generate(request).await?;
......
......@@ -20,7 +20,7 @@ use crate::{
backend::Backend,
kv_router::KvPushRouter,
model_type::ModelType,
preprocessor::{BackendInput, OpenAIPreprocessor},
preprocessor::{OpenAIPreprocessor, PreprocessedRequest},
protocols::common::llm_backend::LLMEngineOutput,
protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
......@@ -196,7 +196,8 @@ impl ModelWatcher {
>::new();
let preprocessor = OpenAIPreprocessor::new(card.clone()).await?.into_operator();
let backend = Backend::from_mdc(card.clone()).await?.into_operator();
let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client(
let router =
PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client(
client.clone(),
self.router_mode,
)
......@@ -231,7 +232,8 @@ impl ModelWatcher {
>::new();
let preprocessor = OpenAIPreprocessor::new(card.clone()).await?.into_operator();
let backend = Backend::from_mdc(card.clone()).await?.into_operator();
let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client(
let router =
PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client(
client,
self.router_mode,
)
......
......@@ -26,7 +26,7 @@ use dynamo_runtime::pipeline::{Error, ManyOut, SingleIn};
use dynamo_runtime::protocols::annotated::Annotated;
use crate::backend::ExecutionContext;
use crate::preprocessor::BackendInput;
use crate::preprocessor::PreprocessedRequest;
use crate::protocols::common::llm_backend::LLMEngineOutput;
use crate::protocols::openai::{
chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse},
......@@ -86,12 +86,12 @@ pub fn make_engine_core() -> ExecutionContext {
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for EchoEngineCore
{
async fn generate(
&self,
incoming_request: SingleIn<BackendInput>,
incoming_request: SingleIn<PreprocessedRequest>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
let (request, context) = incoming_request.into_parts();
let ctx = context.context();
......
......@@ -31,7 +31,7 @@ use crate::{
scheduler::{KvScheduler, KvSchedulerError, SchedulingRequest},
scoring::ProcessedEndpoints,
},
preprocessor::BackendInput,
preprocessor::PreprocessedRequest,
protocols::common::llm_backend::LLMEngineOutput,
tokens::TokenBlockSequence,
};
......@@ -173,13 +173,13 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
}
pub struct KvPushRouter {
inner: PushRouter<BackendInput, Annotated<LLMEngineOutput>>,
inner: PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>,
chooser: Arc<KvRouter>,
}
impl KvPushRouter {
pub fn new(
inner: PushRouter<BackendInput, Annotated<LLMEngineOutput>>,
inner: PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>,
chooser: Arc<KvRouter>,
) -> Self {
KvPushRouter { inner, chooser }
......@@ -187,12 +187,12 @@ impl KvPushRouter {
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for KvPushRouter
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
request: SingleIn<PreprocessedRequest>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
match self.inner.client.instance_source.as_ref() {
InstanceSource::Static => self.inner.r#static(request).await,
......
......@@ -55,7 +55,7 @@ use crate::tokenizers::{traits::Tokenizer, HuggingFaceTokenizer};
use crate::preprocessor::prompt::PromptFormatter;
pub use crate::protocols::common::llm_backend::{BackendInput, BackendOutput};
pub use crate::protocols::common::llm_backend::{BackendOutput, PreprocessedRequest};
pub const ANNOTATION_FORMATTED_PROMPT: &str = "formatted_prompt";
pub const ANNOTATION_TOKEN_IDS: &str = "token_ids";
......@@ -121,9 +121,9 @@ impl OpenAIPreprocessor {
>(
&self,
request: &R,
) -> Result<(BackendInput, HashMap<String, String>)> {
) -> Result<(PreprocessedRequest, HashMap<String, String>)> {
let mut annotations = HashMap::new();
let mut builder = BackendInput::builder();
let mut builder = PreprocessedRequest::builder();
let use_raw_prompt = request
.nvext()
......@@ -266,7 +266,7 @@ impl
Operator<
SingleIn<NvCreateChatCompletionRequest>,
ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
SingleIn<BackendInput>,
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
> for OpenAIPreprocessor
{
......@@ -274,7 +274,11 @@ impl
&self,
request: SingleIn<NvCreateChatCompletionRequest>,
next: Arc<
dyn AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>, Error>,
dyn AsyncEngine<
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
Error,
>,
>,
) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
// unpack the request
......@@ -320,7 +324,7 @@ impl
Operator<
SingleIn<CompletionRequest>,
ManyOut<Annotated<CompletionResponse>>,
SingleIn<BackendInput>,
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
> for OpenAIPreprocessor
{
......@@ -328,7 +332,11 @@ impl
&self,
request: SingleIn<CompletionRequest>,
next: Arc<
dyn AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<BackendOutput>>, Error>,
dyn AsyncEngine<
SingleIn<PreprocessedRequest>,
ManyOut<Annotated<BackendOutput>>,
Error,
>,
>,
) -> Result<ManyOut<Annotated<CompletionResponse>>, Error> {
// unpack the request
......
......@@ -20,7 +20,6 @@ use crate::protocols::TokenIdType;
pub type TokenType = Option<String>;
pub type LogProbs = Vec<f64>;
pub use super::preprocessor::PreprocessedRequest as BackendInput; // TODO stop renaming this
pub use super::preprocessor::PreprocessedRequest;
pub use super::FinishReason;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment