Commit dd31a322 authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

chore: stragglers rename (#69)


Co-authored-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
parent efe82b86
...@@ -99,7 +99,7 @@ PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \ ...@@ -99,7 +99,7 @@ PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
--max-model-len 1000 \ --max-model-len 1000 \
--tensor-parallel-size 1 \ --tensor-parallel-size 1 \
--kv-transfer-config \ --kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'" '{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'"
tmux select-pane -t 2 tmux select-pane -t 2
tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m
...@@ -115,7 +115,7 @@ DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \ ...@@ -115,7 +115,7 @@ DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \
--max-model-len 1000 \ --max-model-len 1000 \
--tensor-parallel-size 1 \ --tensor-parallel-size 1 \
--kv-transfer-config \ --kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'" '{\"kv_connector\":\"DynamoNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'"
tmux select-pane -t 3 tmux select-pane -t 3
tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m
......
...@@ -94,7 +94,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): ...@@ -94,7 +94,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
request_handler = RequestHandler(engine_client, metadata_store) request_handler = RequestHandler(engine_client, metadata_store)
# TODO: integrate prefill_queue to an triton_distributed endpoint # TODO: integrate prefill_queue to a dynamo endpoint
async with PrefillQueue.get_instance( async with PrefillQueue.get_instance(
nats_server=prefill_queue_nats_server, nats_server=prefill_queue_nats_server,
stream_name=prefill_queue_stream_name, stream_name=prefill_queue_stream_name,
......
...@@ -70,7 +70,7 @@ class RequestHandler: ...@@ -70,7 +70,7 @@ class RequestHandler:
print("RequestHandler initialized") print("RequestHandler initialized")
def get_remote_prefill_request_callback(self): def get_remote_prefill_request_callback(self):
# TODO: integrate prefill_queue to an triton_distributed endpoint # TODO: integrate prefill_queue to dynamo endpoint
async def callback(request: RemotePrefillRequest): async def callback(request: RemotePrefillRequest):
async with PrefillQueue.get_instance( async with PrefillQueue.get_instance(
nats_server=self._prefill_queue_nats_server, nats_server=self._prefill_queue_nats_server,
......
...@@ -33,7 +33,7 @@ nvllm_trt_engine_t nvllm_trt_engine_create(const char* config_proto); ...@@ -33,7 +33,7 @@ nvllm_trt_engine_t nvllm_trt_engine_create(const char* config_proto);
// Create a nvLLM TRT Engine from an instance of the engine // Create a nvLLM TRT Engine from an instance of the engine
// This requires the raw engine pointer to be an instantiated object at the exact same // This requires the raw engine pointer to be an instantiated object at the exact same
// commit version as the version of TRTLLM used to build the nvLLM C API. // commit version as the version of TRTLLM used to build the nvLLM C API.
// This is a workaround to enable the Triton TensorRT LLM backend to use nvLLM. // This is a workaround to enable the Dynamo TensorRT LLM backend to use nvLLM.
nvllm_trt_engine_t nvllm_trt_engine_unsafe_create_from_executor(void* engine); nvllm_trt_engine_t nvllm_trt_engine_unsafe_create_from_executor(void* engine);
// Source: Enqueue a streaming request via a json message to the request queue // Source: Enqueue a streaming request via a json message to the request queue
......
...@@ -100,7 +100,7 @@ wheels = [ ...@@ -100,7 +100,7 @@ wheels = [
] ]
[[package]] [[package]]
name = "triton-distributed-rs" name = "dynamo"
version = "0.1.3" version = "0.1.3"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
......
...@@ -27,8 +27,7 @@ pub type StatsHandler = ...@@ -27,8 +27,7 @@ pub type StatsHandler =
pub type EndpointStatsHandler = pub type EndpointStatsHandler =
Box<dyn FnMut(endpoint::Stats) -> serde_json::Value + Send + Sync + 'static>; Box<dyn FnMut(endpoint::Stats) -> serde_json::Value + Send + Sync + 'static>;
// TODO(rename) - pending rename of project pub const PROJECT_NAME: &str = "Dynamo";
pub const PROJECT_NAME: &str = "Triton";
#[derive(Educe, Builder, Dissolve)] #[derive(Educe, Builder, Dissolve)]
#[educe(Debug)] #[educe(Debug)]
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
//! Triton //! Dynamo
#![allow(dead_code)] #![allow(dead_code)]
#![allow(unused_imports)] #![allow(unused_imports)]
...@@ -49,7 +49,7 @@ pub use futures::stream; ...@@ -49,7 +49,7 @@ pub use futures::stream;
pub use tokio_util::sync::CancellationToken; pub use tokio_util::sync::CancellationToken;
pub use worker::Worker; pub use worker::Worker;
/// Types of Tokio runtimes that can be used to construct a Triton [Runtime]. /// Types of Tokio runtimes that can be used to construct a Dynamo [Runtime].
#[derive(Clone)] #[derive(Clone)]
enum RuntimeType { enum RuntimeType {
Shared(Arc<tokio::runtime::Runtime>), Shared(Arc<tokio::runtime::Runtime>),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment