"...configs/git@developer.sourcefind.cn:OpenDAS/dcnv3.git" did not exist on "88dbd1ae88ff3417a05cff8717077f0da1abec7f"
Commit 1f6ccc7f authored by ishandhanani, committed by GitHub

feat: `Frontend` component uses served_model_name instead of model (#302)

parent 476174f3
@@ -163,7 +163,7 @@ This will print out something like
 Service Configuration:
 {
   "Frontend": {
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
     "endpoint": "dynamo.Processor.chat/completions",
     "port": 8000
   },
@@ -189,7 +189,7 @@ Service Configuration:
 }
 Environment Variable that would be set:
-DYNAMO_SERVICE_CONFIG={"Frontend": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "endpoint": "dynamo.Processor.chat/completions", "port": 8000}, "Processor": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "block-size": 64,
+DYNAMO_SERVICE_CONFIG={"Frontend": {"served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "endpoint": "dynamo.Processor.chat/completions", "port": 8000}, "Processor": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "block-size": 64,
 "max-model-len": 16384, "router": "round-robin"}, "VllmWorker": {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "enforce-eager": true, "block-size": 64, "max-model-len": 16384, "max-num-batched-tokens": 16384, "enable-prefix-caching":
 true, "router": "random", "tensor-parallel-size": 1, "ServiceArgs": {"workers": 1}}}
 ```
...
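For context on how the renamed field flows through, here is a minimal sketch of a component reading `DYNAMO_SERVICE_CONFIG` and validating its `Frontend` section. The `load_frontend_config` helper and the use of `os.environ`/`json` are illustrative assumptions; only the `FrontendConfig` fields and the environment variable name come from this change.

```python
import json
import os

from pydantic import BaseModel


class FrontendConfig(BaseModel):
    # Field names as of this commit: "model" was renamed to "served_model_name".
    served_model_name: str
    endpoint: str
    port: int = 8080


def load_frontend_config() -> FrontendConfig:
    # Hypothetical helper: parse the service-config JSON from the environment
    # and validate only the "Frontend" section against the pydantic model.
    raw = os.environ.get("DYNAMO_SERVICE_CONFIG", "{}")
    service_config = json.loads(raw)
    return FrontendConfig(**service_config.get("Frontend", {}))
```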
@@ -17,7 +17,6 @@ import subprocess
 from pathlib import Path
 from components.processor import Processor
-from components.routerless.worker import VllmWorkerRouterLess
 from components.worker import VllmWorker
 from pydantic import BaseModel
@@ -37,7 +36,7 @@ def get_http_binary_path():
 class FrontendConfig(BaseModel):
-    model: str
+    served_model_name: str
     endpoint: str
     port: int = 8080
@@ -50,7 +49,6 @@ class FrontendConfig(BaseModel):
 # todo this should be called ApiServer
 class Frontend:
     worker = depends(VllmWorker)
-    worker_routerless = depends(VllmWorkerRouterLess)
     processor = depends(Processor)
     def __init__(self):
@@ -58,7 +56,13 @@ class Frontend:
         frontend_config = FrontendConfig(**config.get("Frontend", {}))
         subprocess.run(
-            ["llmctl", "http", "remove", "chat-models", frontend_config.model]
+            [
+                "llmctl",
+                "http",
+                "remove",
+                "chat-models",
+                frontend_config.served_model_name,
+            ]
         )
         subprocess.run(
             [
@@ -66,7 +70,7 @@ class Frontend:
                 "http",
                 "add",
                 "chat-models",
-                frontend_config.model,
+                frontend_config.served_model_name,
                 frontend_config.endpoint,
             ]
         )
...
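A practical consequence of the rename: a `Frontend` section that still uses the old `model` key no longer validates, because `served_model_name` becomes the required field. The snippet below is a hypothetical migration check, not code from the repository.

```python
from pydantic import BaseModel, ValidationError


class FrontendConfig(BaseModel):
    served_model_name: str
    endpoint: str
    port: int = 8080


old_style = {"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
             "endpoint": "dynamo.Processor.chat/completions", "port": 8000}
new_style = {"served_model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
             "endpoint": "dynamo.Processor.chat/completions", "port": 8000}

try:
    FrontendConfig(**old_style)     # raises: "served_model_name" is missing
except ValidationError as err:
    print(err)

print(FrontendConfig(**new_style))  # validates with the renamed field
```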
@@ -14,7 +14,7 @@
 # limitations under the License.
 Frontend:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -35,3 +35,5 @@ VllmWorker:
   tensor-parallel-size: 1
   ServiceArgs:
     workers: 1
+    resources:
+      gpu: 1
@@ -14,7 +14,7 @@
 # limitations under the License.
 Frontend:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -40,3 +40,5 @@ VllmWorker:
   tensor-parallel-size: 1
   ServiceArgs:
     workers: 1
+    resources:
+      gpu: 1
@@ -15,7 +15,7 @@
 Frontend:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -24,19 +24,23 @@ Processor:
   router: round-robin
 VllmWorker:
-  # vllm enging args
   model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
   max-model-len: 16384
-  # dynamo args
   remote-prefill: true
   conditional-disagg: true
   max-local-prefill-length: 10
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
 PrefillWorker:
-  # vllm enging args
   model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
   max-model-len: 16384
   max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
@@ -14,7 +14,7 @@
 # limitations under the License.
 Frontend:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -48,3 +48,7 @@ PrefillWorker:
   block-size: 64
   max-model-len: 16384
   max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
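Taken together, the config changes leave each deployment file with the renamed `Frontend` key and an explicit GPU reservation per worker. A representative shape after this commit (values copied from the hunks above; unrelated keys elided):

```yaml
Frontend:
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  endpoint: dynamo.Processor.chat/completions
  port: 8000

VllmWorker:
  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B   # worker sections keep the "model" key
  # ...engine and router arguments elided...
  ServiceArgs:
    workers: 1
    resources:
      gpu: 1
```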