Unverified Commit 3733f585 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat(backends): Python llama.cpp engine (#1925)

parent 6a1350c7
# llama.cpp engine for Dynamo
Usage:
- `pip install -r requirements.txt` # Need a recent pip, `uv pip` might be too old.
- `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
llama-cpp-python -C cmake.args="-DGGML_CUDA=on"
uvloop
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from dynamo.llama_cpp.main import main
if __name__ == "__main__":
main()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Usage: `python -m dynamo.llama_cpp --model-path /data/models/Qwen3-0.6B-Q8_0.gguf [args]`
import argparse
import logging
import sys
from typing import Optional
import uvloop
from llama_cpp import Llama
from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
configure_dynamo_logging()
class Config:
"""Command line parameters or defaults"""
namespace: str
component: str
endpoint: str
model_path: str
model_name: Optional[str]
context_length: int
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
config = cmd_line_args()
component = runtime.namespace(config.namespace).component(config.component)
await component.create_service()
model_type = ModelType.Chat # llama.cpp does the pre-processing
endpoint = component.endpoint(config.endpoint)
await register_llm(model_type, endpoint, config.model_path, config.model_name)
# Initialize the engine
# For more parameters see:
# https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#high-level-api
kwargs = {
"model_path": config.model_path,
"n_gpu_layers": -1, # GPU if we can
"n_threads": 16, # Otherwise give it some CPU
}
if config.context_length:
kwargs["n_ctx"] = config.context_length
engine = Llama(**kwargs)
await endpoint.serve_endpoint(RequestHandler(engine).generate)
class RequestHandler:
def __init__(self, engine):
self.engine_client = engine
async def generate(self, request):
gen = self.engine_client.create_chat_completion(
request["messages"], stream=True
)
# TODO this is a synchronous generator in an async method.
# Move it to a thread so it doesn't block the event loop.
for res in gen:
logging.debug(f"res: {res}")
yield res
def cmd_line_args():
parser = argparse.ArgumentParser(
description="llama.cpp server integrated with Dynamo LLM."
)
parser.add_argument(
"--model-path",
type=str,
required=True,
help="Path to a local GGUF file.",
)
parser.add_argument(
"--endpoint",
type=str,
default=DEFAULT_ENDPOINT,
help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
)
parser.add_argument(
"--model-name",
type=str,
default="",
help="Name to serve the model under. Defaults to deriving it from model path.",
)
parser.add_argument(
"--context-length",
type=int,
default=None,
help="Max model context length. Defaults to models max, usually model_max_length from tokenizer_config.json. Reducing this reduces VRAM requirements.",
)
args = parser.parse_args()
config = Config()
config.model_path = args.model_path
if args.model_name:
config.model_name = args.model_name
else:
# This becomes an `Option` on the Rust side
config.model_name = None
endpoint_str = args.endpoint.replace("dyn://", "", 1)
endpoint_parts = endpoint_str.split(".")
if len(endpoint_parts) != 3:
logging.error(
f"Invalid endpoint format: '{args.endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
)
sys.exit(1)
parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
config.namespace = parsed_namespace
config.component = parsed_component_name
config.endpoint = parsed_endpoint_name
config.context_length = args.context_length
return config
def main():
uvloop.run(worker())
if __name__ == "__main__":
main()
...@@ -79,7 +79,7 @@ requires = ["hatchling"] ...@@ -79,7 +79,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build" build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo"] packages = ["deploy/sdk/src/dynamo", "components/planner/src/dynamo", "components/ingress/src/dynamo", "components/backends/llama_cpp/src/dynamo"]
# This section is for including the binaries in the wheel package # This section is for including the binaries in the wheel package
# but doesn't make them executable scripts in the venv bin directory # but doesn't make them executable scripts in the venv bin directory
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment