Unverified commit 675a9bf5 authored by Graham King, committed by GitHub

chore: Remove TRT-LLM C++ engine in favor of Python one (#747)

parent d797b4ba
@@ -513,26 +513,6 @@ dependencies = [
"which",
]
[[package]]
name = "bindgen"
version = "0.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
dependencies = [
"bitflags 2.9.0",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash 1.1.0",
"shlex",
"syn 2.0.100",
]
[[package]]
name = "bindgen_cuda"
version = "0.1.5"
@@ -1571,29 +1551,6 @@ dependencies = [
"tracing",
]
[[package]]
name = "dynamo-engine-trtllm"
version = "0.1.1"
dependencies = [
"anyhow",
"async-openai",
"async-stream",
"async-trait",
"bindgen 0.70.1",
"cmake",
"derive_builder",
"dynamo-llm",
"dynamo-runtime",
"futures",
"serde",
"serde_json",
"serde_repr",
"thiserror 2.0.12",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "dynamo-engine-vllm0_7"
version = "0.1.1"
@@ -1711,7 +1668,6 @@ dependencies = [
"dynamo-engine-mistralrs",
"dynamo-engine-python",
"dynamo-engine-sglang",
"dynamo-engine-trtllm",
"dynamo-engine-vllm0_7", "dynamo-engine-vllm0_7",
"dynamo-engine-vllm0_8", "dynamo-engine-vllm0_8",
"dynamo-llm", "dynamo-llm",
@@ -3440,7 +3396,7 @@ version = "0.1.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0522f9894e22dd988dd2e34222bda7acba53a0dcce744ca6d8ddce905ba33a4e"
dependencies = [
"bindgen 0.69.5", "bindgen",
"cc", "cc",
"cmake", "cmake",
"find_cuda_helper", "find_cuda_helper",
......
@@ -6,11 +6,11 @@
* [Multi-node](#multi-node)
* [Compiling from Source](#compiling-from-source)
* [Setup](#setup)
* [sglang](#sglang) * [Sglang](#sglang)
* [llama_cpp](#llama_cpp) * [llama.cpp](#llama_cpp)
* [vllm](#vllm) * [Vllm](#vllm)
* [Python bring-your-own-engine](#python-bring-your-own-engine)
* [trtllm](#trtllm) * [TensorRT-LLM](#tensorrt-llm-engine)
* [Echo Engines](#echo-engines)
* [Batch mode](#batch-mode)
* [Defaults](#defaults)
@@ -325,7 +325,7 @@ MAIN: ['my_engine.py', '--model-path', '/opt/models/Llama-3.2-3B-Instruct/', '--
This allows quick iteration on the engine setup. Note how the `-n 1` is included. Flags `--leader-addr` and `--model-config` will also be added if provided to `dynamo-run`.
#### TensorRT-LLM `pystr` engine #### TensorRT-LLM engine
To run a TRT-LLM model with dynamo-run we have included a Python-based [async engine](/examples/tensorrt_llm/engines/agg_engine.py).
To configure the TensorRT-LLM async engine, see [llm_api_config.yaml](/examples/tensorrt_llm/configs/llm_api_config.yaml). The file defines the options that need to be passed to the LLM engine. Follow the steps below to serve trtllm with dynamo-run.
@@ -386,24 +386,6 @@ async def generate(request):
`pytok` supports the same ways of passing command line arguments as `pystr` - `initialize` or `main` with `sys.argv`.
### trtllm
TensorRT-LLM. Requires `clang` and `libclang-dev`.
1. Build:
```
cargo build --features trtllm
```
2. Run:
```
dynamo-run in=text out=trtllm --model-path /app/trtllm_engine/ --model-config ~/llm_models/Llama-3.2-3B-Instruct/
```
Note that TRT-LLM uses its own `.engine` format for weights.
The `--model-path` you give to `dynamo-run` must contain the `config.json` (TRT-LLM's, not the model's) and `rank0.engine` (plus other ranks if relevant).
### Echo Engines
Dynamo includes two echo engines for testing and debugging purposes:
...
@@ -32,7 +32,6 @@ mistralrs = ["dep:dynamo-engine-mistralrs"]
llamacpp = ["dep:dynamo-engine-llamacpp"]
vllm = ["dep:dynamo-engine-vllm0_7", "dep:dynamo-engine-vllm0_8", "dep:netlink-packet-route", "dep:rtnetlink"]
sglang = ["dep:dynamo-engine-sglang", "dep:netlink-packet-route", "dep:rtnetlink"]
trtllm = ["dep:dynamo-engine-trtllm"]
python = ["dep:dynamo-engine-python"] python = ["dep:dynamo-engine-python"]
cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"] cuda = ["dynamo-engine-llamacpp/cuda", "dynamo-engine-mistralrs/cuda"]
@@ -48,7 +47,6 @@ dynamo-engine-mistralrs = { path = "../../lib/engines/mistralrs", optional = true }
dynamo-engine-sglang = { path = "../../lib/engines/sglang", optional = true }
dynamo-engine-vllm0_7 = { path = "../../lib/engines/vllm0_7", optional = true }
dynamo-engine-vllm0_8 = { path = "../../lib/engines/vllm0_8", optional = true }
dynamo-engine-trtllm = { path = "../../lib/engines/trtllm", optional = true }
dynamo-engine-python = { path = "../../lib/engines/python", optional = true }
anyhow = { workspace = true }
...
@@ -55,7 +55,7 @@ pub struct Flags {
#[arg(long)]
pub model_config: Option<PathBuf>,
/// sglang, vllm, trtllm /// sglang, vllm
///
/// How many GPUs to use at once, total across all nodes.
/// This must be divisible by num_nodes, and each node must use the same number of GPUs.
...
@@ -421,28 +421,6 @@ pub async fn run(
card: Box::new(card),
}
}
#[cfg(feature = "trtllm")]
Output::TrtLLM => {
let Some(model_path) = model_path else {
anyhow::bail!("out=trtllm requires flag --model-path=<full-path-to-model-dir>");
};
if !model_path.is_dir() {
anyhow::bail!(
"--model-path should point at a directory containing `.engine` files."
);
}
// Safety: Earlier we built maybe_card from model_path, which we checked right above
let card = maybe_card.clone().unwrap();
let engine = dynamo_engine_trtllm::make_engine(
model_path.display(),
flags.tensor_parallel_size,
)?;
EngineConfig::StaticCore {
service_name: card.service_name.clone(),
engine,
card: Box::new(card),
}
}
#[cfg(feature = "python")] #[cfg(feature = "python")]
Output::PythonStr(path_str) => { Output::PythonStr(path_str) => {
let Some(model_name) = model_name else { let Some(model_name) = model_name else {
......
@@ -122,10 +122,6 @@ pub enum Output {
/// Run inference using vllm 0.7.X
Vllm0_7,
#[cfg(feature = "trtllm")]
/// Run inference using trtllm
TrtLLM,
/// Run inference using a user supplied python file that accepts and returns
/// strings. It does its own pre-processing.
#[cfg(feature = "python")]
@@ -161,9 +157,6 @@ impl TryFrom<&str> for Output {
#[cfg(feature = "vllm")]
"vllm0_7" => Ok(Output::Vllm0_7),
#[cfg(feature = "trtllm")]
"trtllm" => Ok(Output::TrtLLM),
"echo_full" => Ok(Output::EchoFull), "echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore), "echo_core" => Ok(Output::EchoCore),
@@ -212,9 +205,6 @@ impl fmt::Display for Output {
#[cfg(feature = "vllm")]
Output::Vllm0_7 => "vllm0_7",
#[cfg(feature = "trtllm")]
Output::TrtLLM => "trtllm",
Output::EchoFull => "echo_full", Output::EchoFull => "echo_full",
Output::EchoCore => "echo_core", Output::EchoCore => "echo_core",
@@ -295,11 +285,6 @@ impl Output {
out.push(Output::PythonTok("file.py".to_string()).to_string());
}
#[cfg(feature = "trtllm")]
{
out.push(Output::TrtLLM.to_string());
}
out
}
}
---
# Refer to the following link for the explanation of each params:
# http://releases.llvm.org/12.0.0/tools/clang/docs/ClangFormatStyleOptions.html
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false # Allows placing a breakpoint on each case label
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Empty
AllowShortLoopsOnASingleLine: false
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: true
AfterControlStatement: true
AfterEnum: true
AfterFunction: true
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: true
AfterUnion: true
AfterExternBlock: false
BeforeCatch: false
BeforeElse: true
IndentBraces: false
# Disable the splits below; otherwise they just add to the vertical length of source files.
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: AfterColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
# Keep the two settings below the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 2
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Regroup
IncludeCategories:
# The order of the groups is
# 0 - Main include file for .cpp
# 1 - source relative files `#include "./some_header.hpp"` (Grouped with 2)
# 2 - source relative files starting with internal/public `#include "internal/some_header.hpp"`
# 3 - Python MRC public API files `#include "pymrc/mrc_header.hpp"`
# 4 - MRC public API files `#include "mrc/mrc_header.hpp"`
# 5 - NVRPC public API files `#include "nvrpc/some_header.hpp"`
# 6 - External installed libraries `#include <external_lib/some_header.hpp>`
# 7 - System includes `#include <string>`
# First match any Python MRC public API headers with quotes
- Regex: '^"pymrc\/.*\.(h|hpp)"'
Priority: 3
# Next match any MRC public API headers with quotes
- Regex: '^"mrc\/.*\.(h|hpp)"'
Priority: 4
# Next match public NVRPC headers with quotes
- Regex: '^<nvrpc\/.*\.(h|hpp)>'
Priority: 5
# Next find any headers in internal or public
- Regex: '^"(internal|public)\/.*\.(h|hpp)"'
Priority: 2
# Any other quoted includes need to be with internal/public but on top (that's why this group is last)
- Regex: '^".*\.(h|hpp)"'
Priority: 1
# Last are system includes, which don't have a '/', like <string> or <mutex>
- Regex: '<([a-z_])+>'
Priority: 7
# Finally, put all 3rd party includes before the system includes
- Regex: '^<.*'
Priority: 6
# IncludeIsMainSourceRegex: '$?'
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseBlocks: false
IndentCaseLabels: false
IndentPPDirectives: BeforeHash
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 80
PenaltyBreakBeforeFirstCallParameter: 0
PenaltyBreakComment: 10
PenaltyBreakFirstLessLess: 10
PenaltyBreakString: 0
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 600
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
# Note: enabling comment reflow can mangle the formatting of Doxygen comments!
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: c++20
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
# Keep this consistent with IndentWidth, even for people who use tabs for indentation!
TabWidth: 4
UseTab: Never
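
To make the `IncludeCategories` regrouping above concrete: here is how a hypothetical file's includes (names borrowed from the comments in that block) would end up ordered after formatting. This is an illustration only, not output from the tool.

```
// Hypothetical include order under the config above; blank lines separate
// the priority groups 1 through 7. The file's own header (group 0) would
// come before all of these.
#include "some_header.hpp"               // 1: other quoted includes

#include "internal/some_header.hpp"      // 2: source-relative internal/public

#include "pymrc/mrc_header.hpp"          // 3: Python MRC public API

#include "mrc/mrc_header.hpp"            // 4: MRC public API

#include <nvrpc/some_header.hpp>         // 5: NVRPC public API

#include <external_lib/some_header.hpp>  // 6: external installed libraries

#include <string>                        // 7: system includes
```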
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.17)
project(
nvllm
VERSION 0.1.0.0
LANGUAGES CXX
)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
option(USE_STUBS "Build with stub implementations instead of real CUDA code" OFF)
if (USE_STUBS)
add_definitions(-DUSE_STUBS)
set(SOURCE_FILES
src/nvllm_trt.cpp
src/engine_stub/engine.cpp
)
add_library(tensorrt_llm SHARED src/engine_stub/tensorrt_llm.cpp)
else()
#SET(TRTLLM_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../submodules/tensorrt_llm" CACHE STRING "TRTLLM_SRC_DIR: /../../submodules/tensorrt_llm")
SET(TRTLLM_LIB_DIR "/usr/local/lib" CACHE STRING "TRTLLM_LIB_DIR: /usr/local/lib")
#include(${TRTLLM_SRC_DIR}/cpp/cmake/modules/find_library_create_target.cmake)
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;/opt/hpcx/ompi/lib:/usr/local/cuda/lib64:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:/src/tensorrt_llm/cpp/build/tensorrt_llm/plugins")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
include(FetchContent)
FetchContent_Declare(
json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.11.2
)
FetchContent_Declare(
spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.15.0
)
# Make nlohmann/json available
FetchContent_MakeAvailable(json)
FetchContent_MakeAvailable(spdlog)
set_property(TARGET spdlog PROPERTY POSITION_INDEPENDENT_CODE ON)
add_library(tensorrt_llm SHARED IMPORTED)
set_target_properties(
tensorrt_llm
PROPERTIES
IMPORTED_LOCATION "${TRTLLM_LIB_DIR}/libtensorrt_llm.so"
)
add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_target_properties(
nvinfer_plugin_tensorrt_llm
PROPERTIES
IMPORTED_LOCATION "${TRTLLM_LIB_DIR}/libnvinfer_plugin_tensorrt_llm.so"
)
add_library(xxhash STATIC IMPORTED)
set_target_properties(
xxhash
PROPERTIES
IMPORTED_LOCATION "/usr/lib/x86_64-linux-gnu/libxxhash.a"
)
set(SOURCE_FILES
src/nvllm_trt.cpp
src/engine_trt/engine.cpp
src/engine_trt/request.cpp
src/engine_trt/response.cpp
src/engine_trt/config.cpp
src/engine_trt/kv_event.cpp
src/engine_trt/stats.cpp
${PROTO_SRCS} ${PROTO_HDRS}
# ... other source files ...
)
endif()
function(set_library_target_properties target)
target_include_directories(
${target}
PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/>
$<INSTALL_INTERFACE:include/>
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_BINARY_DIR}
/usr/local/cuda-12.6/targets/x86_64-linux/include
/usr/local/tensorrt/include/
)
target_compile_features(${target} PRIVATE cxx_std_17)
set_target_properties(${target} PROPERTIES OUTPUT_NAME nvllm_trt)
target_compile_options(
${target}
PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall
-Wextra
-Wno-unused-parameter
-Wno-type-limits
-Wno-deprecated-declarations>
$<$<CXX_COMPILER_ID:MSVC>:/Wall
/D_WIN32_WINNT=0x0A00
/EHsc>)
if (NOT USE_STUBS)
target_link_libraries(
${target}
PRIVATE tensorrt_llm
${Protobuf_LIBRARIES}
xxhash
# ${MPI_LIBRARIES}
# ${CUDA_LIBRARIES}
# nvinfer
nvinfer_plugin_tensorrt_llm
nlohmann_json::nlohmann_json
spdlog::spdlog
)
endif()
# target_link_options(${target} PRIVATE "-static")
target_link_libraries(${target} PUBLIC
)
endfunction()
add_library(nvllm_trt SHARED ${SOURCE_FILES})
set_library_target_properties(nvllm_trt)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/nvllmConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/nvllmConfig.cmake
INSTALL_DESTINATION lib/cmake/nvllm
)
write_basic_package_version_file(
"nvllmConfigVersion.cmake"
VERSION ${PROJECT_VERSION}
COMPATIBILITY AnyNewerVersion
)
# Installation rules
install(TARGETS nvllm_trt
EXPORT nvllmConfig # This should match the name used in configure_package_config_file
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
RUNTIME DESTINATION bin
INCLUDES DESTINATION include
)
# Install the nvllmConfig.cmake and nvllmConfigVersion.cmake files
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/nvllmConfig.cmake # Corrected the file name
${CMAKE_CURRENT_BINARY_DIR}/nvllmConfigVersion.cmake
DESTINATION lib/cmake/nvllm
)
# # Install config.h
# install(FILES "${PROJECT_BINARY_DIR}/config.h"
# DESTINATION include/nvidia/nvllm)
# Install header files
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
DESTINATION include)
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
macro(find_library_create_target target_name lib libtype hints)
message(
STATUS
"========================= Importing and creating target ${target_name} =========================="
)
message(STATUS "Looking for library ${lib}")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
find_library(
${lib}_LIB_PATH ${lib}${TRT_DEBUG_POSTFIX}
HINTS ${hints}
NO_DEFAULT_PATH)
endif()
find_library(${lib}_LIB_PATH ${lib} HINTS ${hints} NO_DEFAULT_PATH)
find_library(${lib}_LIB_PATH ${lib})
message(STATUS "Library that was found ${${lib}_LIB_PATH}")
add_library(${target_name} ${libtype} IMPORTED)
set_target_properties(
${target_name} PROPERTIES IMPORTED_LOCATION ${${lib}_LIB_PATH}
IMPORTED_IMPLIB ${${lib}_LIB_PATH})
message(
STATUS
"=========================================================================================="
)
endmacro()
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
#
function(set_ifndef variable value)
if(NOT DEFINED ${variable})
set(${variable}
${value}
PARENT_SCOPE)
endif()
endfunction()
#ifndef __NVIDIA_NVLLM_TRT_C_API__
#define __NVIDIA_NVLLM_TRT_C_API__
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
typedef enum
{
NVLLM_TRT_ENGINE_SUCCESS = 0, // No error
NVLLM_TRT_ENGINE_INVALID_REQUEST = 1, // Invalid request error
NVLLM_TRT_ENGINE_SHUTDOWN_REQUIRED = 2, // Shutdown and join required before destroying
NVLLM_TRT_ENGINE_SHUTDOWN_IN_PROGRESS = 3, // Shutdown in progress
} nvllm_trt_engine_error_t;
// struct nvllm_trt_engine {};
// Forward declaration of the C++ class
typedef struct nvllm_trt_engine nvllm_trt_engine;
typedef nvllm_trt_engine* nvllm_trt_engine_t;
typedef uint64_t request_id_t;
typedef uint64_t client_id_t;
// Set the MPI Communicator for the TensorRT LLM Engine
// This function should be called before creating the engine
int nvllm_trt_mpi_session_set_communicator(void* world_comm_ptr);
// Functions to interact with nvllm_trt_engine_s
nvllm_trt_engine_t nvllm_trt_engine_create(const char* config_proto);
// Create a nvLLM TRT Engine from an instance of the engine
// This requires the raw engine pointer to be an instantiated object at the exact same
// commit version as the version of TRTLLM used to build the nvLLM C API.
// This is a workaround to enable the Dynamo TensorRT LLM backend to use nvLLM.
nvllm_trt_engine_t nvllm_trt_engine_unsafe_create_from_executor(void* engine);
// Source: Enqueue a streaming request via a json message to the request queue
request_id_t nvllm_trt_engine_enqueue_request(nvllm_trt_engine_t engine, client_id_t client_id, const char* req_proto);
// Sink: Pull off streaming responses from the response queue
char* nvllm_trt_engine_await_responses(nvllm_trt_engine_t engine);
// Sink: Pull off KvEvents from the event queue
char* nvllm_trt_engine_await_kv_events(nvllm_trt_engine_t engine);
// Get basic iteration stats
char* nvllm_trt_engine_await_iter_stats(nvllm_trt_engine_t engine);
// Free the memory allocated by nvllm_trt_engine_await_responses
void nvllm_trt_engine_free_responses(char* responses);
// Cancel an in-flight request; cancellation is by request_id, not client_id
void nvllm_trt_engine_cancel_request(nvllm_trt_engine_t engine, uint64_t request_id);
// Initiate the shutdown sequence
void nvllm_trt_engine_shutdown(nvllm_trt_engine_t engine);
// // Await for the shutdown to complete; shutdown will be requested if not already requested
// void nvllm_trt_engine_join(nvllm_trt_engine_t engine);
// Destroy the engine
int nvllm_trt_engine_destroy(nvllm_trt_engine_t engine);
// Returns true (non-zero) once the engine has started pulling requests
// There is currently no stopping, so once an engine has started,
// it will always return true, even when complete.
// This call does not block; the user should use some backoff strategy
// to poll for the engine becoming ready.
int nvllm_trt_engine_is_ready(nvllm_trt_engine_t engine);
// Returns true (non-zero) once the engine has stopped pulling requests
int nvllm_trt_engine_has_completed(nvllm_trt_engine_t engine);
// // Returns the major version number of the trtllm library
// int trtllm_version_major();
// // Returns the minor version number of the trtllm library
// int trtllm_version_minor();
// // Returns the patch version number of the trtllm library
// int trtllm_version_patch();
#ifdef __cplusplus
}
#endif
#endif // __NVIDIA_NVLLM_TRT_C_API__
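
For orientation, a minimal sketch of the intended call sequence for this C API: create, poll for readiness with a backoff, enqueue, drain one batch of responses, then shut down. The config JSON mirrors the fields handled by `deserialize_config` further down; the request body is elided because its schema is not shown here.

```
// Sketch only; the config values are illustrative and the request JSON is elided.
#include "nvidia/nvllm/nvllm_trt.h"

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    nvllm_trt_engine_t engine =
        nvllm_trt_engine_create("{\"model_path\": \"/engines/llama\", \"log_level\": \"info\"}");

    // Poll with a simple backoff until the engine starts pulling requests.
    while (!nvllm_trt_engine_is_ready(engine))
    {
        usleep(100 * 1000);
    }

    client_id_t client_id = 1;
    request_id_t request_id = nvllm_trt_engine_enqueue_request(engine, client_id, "{...}");  // request JSON elided
    (void)request_id;  // needed only if we later call nvllm_trt_engine_cancel_request

    // Blocks until a batch of responses is available; the string is ours to free.
    char* responses = nvllm_trt_engine_await_responses(engine);
    printf("%s\n", responses);
    nvllm_trt_engine_free_responses(responses);

    nvllm_trt_engine_shutdown(engine);
    return nvllm_trt_engine_destroy(engine);
}
```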
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <optional>
#include <string>
namespace nvidia::nvllm::trt {
class StreamingEngine
{
public:
StreamingEngine(const std::string& config_proto);
StreamingEngine(void* engine);
~StreamingEngine();
// accepts a string of a serialized proto::Request
// forms the internal request object and enqueues it
// returns a request_id provided by the engine; this must be used to cancel the request
// accepts a client_id which can be used to identify the response
uint64_t enqueue_request(uint64_t client_id, const std::string& json_request);
// awaits the presence of a response
// converts the internal format to a json and returns the string
std::string await_responses();
// awaits the presence of kv events
std::optional<std::string> await_kv_events();
// Awaits iteration stats
std::optional<std::string> await_iter_stats();
// cancel request
void cancel_request(uint64_t request_id);
// called to start the shutdown sequence
void shutdown();
// returns true once the engine has started pulling requests
// there is currently no stopping, so once an engine has_started,
// it will always return true, even when complete
bool is_ready() const;
// returns true if the StreamingEngine has been both shutdown and joined
bool has_completed() const;
private:
class Impl;
std::unique_ptr<Impl> m_impl;
};
} // namespace nvidia::nvllm::trt
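
A short, hypothetical driver for the class above. `await_responses()` blocks, so it is drained on a dedicated thread; per the implementation notes below, shutting the executor down is what unblocks that call.

```
// Sketch only: drains the blocking await_responses() on its own thread.
#include "api/engine.hpp"  // path as included by the implementation files

#include <cstdint>
#include <iostream>
#include <thread>

int main()
{
    using nvidia::nvllm::trt::StreamingEngine;
    StreamingEngine engine{R"({"model_path": "/engines/llama", "log_level": "info"})"};

    std::thread drain([&engine] {
        while (!engine.has_completed())
        {
            // Blocks; shutdown() below makes the executor unblock this call.
            std::cout << engine.await_responses() << '\n';
        }
    });

    uint64_t client_id = 1;
    engine.enqueue_request(client_id, "{}");  // request JSON elided

    engine.shutdown();
    drain.join();
}
```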
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Public API for the StreamingEngine class
#include "nvidia/nvllm/nvllm_trt.h"
// Internal Private Implementation
#include "api/engine.hpp"
#include <optional>
extern "C" {
bool initTrtLlmPlugins(void* logger, char const* libNamespace);
}
namespace nvidia::nvllm::trt {
class StreamingEngine::Impl
{
public:
Impl(const std::string& config_proto);
Impl(void* engine);
~Impl() = default;
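// Stub build (USE_STUBS): every entry point aborts at runtime; this path exists
// to compile and link without the real TensorRT-LLM libraries.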
uint64_t enqueue_request(uint64_t client_id, const std::string& req_proto)
{
std::abort();
return 911;
}
void cancel_request(uint64_t request_id) {}
std::string await_responses()
{
std::abort();
return {};
}
std::optional<std::string> await_kv_events()
{
std::abort();
return std::nullopt;
}
std::optional<std::string> await_iter_stats()
{
std::abort();
return std::nullopt;
}
void shutdown()
{
std::abort();
}
bool is_ready() const
{
std::abort();
return false;
}
bool has_completed() const
{
std::abort();
return false;
}
};
// Private Engine Impl
StreamingEngine::Impl::Impl(const std::string& config_proto)
{
initTrtLlmPlugins(nullptr, nullptr);
}
StreamingEngine::Impl::Impl(void* engine)
{
initTrtLlmPlugins(nullptr, nullptr);
}
// Public Engine Impl
StreamingEngine::StreamingEngine(const std::string& config_proto) :
m_impl{std::make_unique<Impl>(config_proto)} {}
StreamingEngine::StreamingEngine(void* engine) :
m_impl{std::make_unique<Impl>(engine)} {}
StreamingEngine::~StreamingEngine()
{
if (!m_impl->has_completed())
{
m_impl->shutdown();
}
}
uint64_t StreamingEngine::enqueue_request(uint64_t client_id, const std::string& req_proto)
{
return m_impl->enqueue_request(client_id, req_proto);
}
std::string StreamingEngine::await_responses()
{
return m_impl->await_responses();
}
std::optional<std::string> StreamingEngine::await_kv_events()
{
return m_impl->await_kv_events();
}
std::optional<std::string> StreamingEngine::await_iter_stats()
{
return m_impl->await_iter_stats();
}
void StreamingEngine::cancel_request(uint64_t request_id)
{
m_impl->cancel_request(request_id);
}
void StreamingEngine::shutdown()
{
m_impl->shutdown();
}
bool StreamingEngine::is_ready() const
{
return m_impl->is_ready();
}
bool StreamingEngine::has_completed() const
{
return m_impl->has_completed();
}
} // namespace nvidia::nvllm::trt
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
extern "C" {
// Stub: report success without loading any plugins.
bool initTrtLlmPlugins(void* /*logger*/, char const* /*libNamespace*/)
{
    return true;
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "engine_trt/config.hpp"
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>
using json = nlohmann::json;
namespace ex = tensorrt_llm::executor;
namespace nvidia::nvllm::trt {
struct ExecutorConfig
{
std::string model_path;
std::string log_level;
std::optional<bool> enable_chunked_context;
std::optional<bool> normalize_log_probs;
std::optional<uint32_t> iter_stats_max_iterations;
};
// Custom to_json function
inline void to_json(json& j, const ExecutorConfig& e)
{
j = json{{"model_path", e.model_path}, {"log_level", e.log_level}};
if (e.enable_chunked_context)
{
j["enable_chunked_context"] = e.enable_chunked_context.value();
}
if (e.normalize_log_probs)
{
j["normalize_log_probs"] = e.normalize_log_probs.value();
}
if (e.iter_stats_max_iterations)
{
j["iter_stats_max_iterations"] = e.iter_stats_max_iterations.value();
}
}
// Custom from_json function
inline void from_json(const json& j, ExecutorConfig& e)
{
j.at("model_path").get_to(e.model_path);
j.at("log_level").get_to(e.log_level);
if (j.contains("enable_chunked_context"))
{
e.enable_chunked_context = j.at("enable_chunked_context").get<bool>();
}
else
{
e.enable_chunked_context = std::nullopt;
}
if (j.contains("normalize_log_probs"))
{
e.normalize_log_probs = j.at("normalize_log_probs").get<bool>();
}
else
{
e.normalize_log_probs = std::nullopt;
}
if (j.contains("iter_stats_max_iterations"))
{
e.iter_stats_max_iterations = j.at("iter_stats_max_iterations").get<uint32_t>();
}
else
{
e.iter_stats_max_iterations = std::nullopt;
}
}
Config deserialize_config(const std::string& config_json)
{
auto config_in = json::parse(config_json).get<ExecutorConfig>();
auto model_path = config_in.model_path;
auto log_level = config_in.log_level;
auto config = ex::ExecutorConfig();
// todo - expose max num tokens
// todo - expose from engine block reuse
if (config_in.enable_chunked_context)
{
spdlog::info("Enable chunked context: {}", config_in.enable_chunked_context.value() ? "true" : "false");
config.setEnableChunkedContext(config_in.enable_chunked_context.value());
}
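// note: normalize_log_probs and iter_stats_max_iterations are deserialized above
// but not yet applied to the executor config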
return {model_path, log_level, config};
}
} // namespace nvidia::nvllm::trt
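
For reference, a config string that the `from_json` above accepts might look like this (values invented):

```
// Hypothetical input for deserialize_config(): model_path and log_level are
// required; the remaining three fields are optional.
const char* config_json = R"({
    "model_path": "/engines/llama",
    "log_level": "info",
    "enable_chunked_context": true,
    "normalize_log_probs": false,
    "iter_stats_max_iterations": 1000
})";
```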
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensorrt_llm/executor/executor.h"
namespace nvidia::nvllm::trt {
struct Config
{
std::string model_path;
std::string log_level;
tensorrt_llm::executor::ExecutorConfig config;
};
Config deserialize_config(const std::string& request);
} // namespace nvidia::nvllm::trt
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Public API for the StreamingEngine class
#include "nvidia/nvllm/nvllm_trt.h"
// Internal Private Implementation
#include "api/engine.hpp"
#include "engine_trt/config.hpp"
#include "engine_trt/kv_event.hpp"
#include "engine_trt/request.hpp"
#include "engine_trt/response.hpp"
#include "engine_trt/stats.hpp"
// TensorRT LLM Executor
#include "NvInfer.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
// Third-party
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/spdlog.h>
namespace ex = tensorrt_llm::executor;
namespace nvidia::nvllm::trt {
/// Customize the logger for TensorRT LLM using a module-specific spdlog logger
class TRTLogger : public nvinfer1::ILogger
{
public:
TRTLogger(std::shared_ptr<spdlog::logger> logger) : m_logger(logger) {}
void log(nvinfer1::ILogger::Severity severity, const char* msg) noexcept override
{
if (severity <= nvinfer1::ILogger::Severity::kERROR)
{
m_logger->error("{}", msg);
}
else if (severity == nvinfer1::ILogger::Severity::kWARNING)
{
m_logger->warn("{}", msg);
}
else
{
m_logger->info("{}", msg);
}
}
private:
std::shared_ptr<spdlog::logger> m_logger;
};
class StreamingEngine::Impl
{
public:
Impl(const std::string& config_proto);
Impl(void* engine);
~Impl() = default;
/// Enqueues a request to the executor
/// In this opinionated implementation, [`client_id`] is required to be unique
uint64_t enqueue_request(uint64_t client_id, const std::string& req_json)
{
spdlog::trace("enqueue_request - client_id: {}", client_id);
auto request = deserialize_request(req_json);
request.setClientId(client_id);
auto request_id = m_executor->enqueueRequest(request);
spdlog::trace("request_id: {} with client_id {} was enqueued", request_id, client_id);
return request_id;
}
/// Cancellation is by [`request_id`], not [`client_id`]
void cancel_request(uint64_t request_id)
{
spdlog::trace("cancel_request: {}", request_id);
m_executor->cancelRequest(request_id);
}
/// Issues a shutdown request to the executor. This is a blocking call.
/// We protect it with a mutex to ensure that it is only called once.
void shutdown()
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_has_completed)
{
return;
}
m_executor->shutdown();
m_has_completed = true;
}
/// Returns true if the executor is ready to accept requests.
/// Not sure of TensorRT LLM's behavior when the executor is shut down, so we
/// return false if the executor has completed.
bool is_ready() const
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_has_completed)
{
return false;
}
return m_executor->canEnqueueRequests();
}
/// Returns true if the executor has completed.
bool has_completed() const
{
std::lock_guard<std::mutex> lock(m_mutex);
return m_has_completed;
}
/// Awaits on the executor for responses. This is a blocking call.
/// TensorRT LLM will throw an exception if a thread is blocked on the calls and the
/// executor is shutdown.
std::string await_responses()
{
spdlog::trace("blocking on await_responses");
std::deque<ex::Response> responses;
bool shutdown = false;
try
{
auto v_responses = m_executor->awaitResponses();
spdlog::trace("received {} responses", v_responses.size());
for (auto& response : v_responses)
{
responses.push_back(std::move(response));
}
} catch (const std::exception& e)
{
spdlog::trace("Exception caught awaiting responses; shutting down");
shutdown = true;
}
return serialize_responses(std::move(responses), shutdown);
}
/// Awaits for KV events. This is a blocking call with a timeout of 250ms.
/// The current implementation will not throw an exception if the executor is shut down,
/// so we need to time out the call to ensure that the calling thread can shut down properly.
std::optional<std::string> await_kv_events()
{
if (m_kv_cache_event_manager == nullptr)
{
auto manager = m_executor->getKVCacheEventManager();
if (manager)
{
m_kv_cache_event_manager = *manager;
}
}
if (m_kv_cache_event_manager == nullptr)
{
return std::nullopt;
}
try
{
auto events = m_kv_cache_event_manager->getLatestEvents({std::chrono::milliseconds(250)});
if (!events.empty())
{
spdlog::trace("received {} on kv_events", events.size());
}
return {serialize_kv_events(std::move(events), false)};
} catch (const std::exception& e)
{
spdlog::trace("Exception caught awaiting kv events; shutting down");
return {serialize_kv_events({}, true)};
}
}
// Awaits iteration stats
std::optional<std::string> await_iter_stats()
{
auto iter_stats = m_executor->getLatestIterationStats();
return serialize_iter_stats(iter_stats);
}
private:
std::unique_ptr<ex::Executor> m_executor;
std::shared_ptr<ex::KVCacheEventManager> m_kv_cache_event_manager = nullptr;
bool m_has_completed = false;
mutable std::mutex m_mutex;
};
// Private Engine Impl
StreamingEngine::Impl::Impl(void* engine)
{
auto nvllm_logger = spdlog::stdout_color_mt("nvllm");
spdlog::set_default_logger(nvllm_logger);
spdlog::info("Instantiating nvLLM from raw TensorRT LLM Executor pointer");
m_executor.reset(reinterpret_cast<ex::Executor*>(engine));
}
StreamingEngine::Impl::Impl(const std::string& config_json)
{
auto nvllm_logger = spdlog::stdout_color_mt("nvllm");
auto trtllm_logger = spdlog::stdout_color_mt("trtllm");
spdlog::set_default_logger(nvllm_logger);
auto config = deserialize_config(config_json);
if (config.log_level == "error")
{
spdlog::set_level(spdlog::level::err);
nvllm_logger->set_level(spdlog::level::err);
trtllm_logger->set_level(spdlog::level::err);
}
else if (config.log_level == "warn")
{
spdlog::set_level(spdlog::level::warn);
nvllm_logger->set_level(spdlog::level::warn);
trtllm_logger->set_level(spdlog::level::warn);
}
else if (config.log_level == "info")
{
spdlog::set_level(spdlog::level::info);
nvllm_logger->set_level(spdlog::level::info);
trtllm_logger->set_level(spdlog::level::info);
}
else if (config.log_level == "debug")
{
spdlog::set_level(spdlog::level::debug);
nvllm_logger->set_level(spdlog::level::debug);
trtllm_logger->set_level(spdlog::level::debug);
}
else if (config.log_level == "trace")
{
spdlog::set_level(spdlog::level::trace);
nvllm_logger->set_level(spdlog::level::trace);
trtllm_logger->set_level(spdlog::level::trace);
}
else
{
spdlog::set_level(spdlog::level::err);
nvllm_logger->set_level(spdlog::level::err);
trtllm_logger->set_level(spdlog::level::err);
}
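// Never freed: the TensorRT plugin runtime holds this raw pointer, so it must
// outlive all TensorRT use in the process.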
TRTLogger* trtLogger = new TRTLogger(trtllm_logger);
initTrtLlmPlugins(trtLogger);
auto kv_config = config.config.getKvCacheConfig();
spdlog::info("Enabled block reuse: true");
kv_config.setEnableBlockReuse(true);
kv_config.setEventBufferMaxSize(65536);
config.config.setKvCacheConfig(kv_config);
m_executor = std::make_unique<ex::Executor>(config.model_path, ex::ModelType::kDECODER_ONLY, config.config);
}
// Public Engine Impl
StreamingEngine::StreamingEngine(const std::string& config_proto) :
m_impl{std::make_unique<Impl>(config_proto)} {}
StreamingEngine::StreamingEngine(void* engine) :
m_impl{std::make_unique<Impl>(engine)} {}
StreamingEngine::~StreamingEngine()
{
if (!m_impl->has_completed())
{
m_impl->shutdown();
}
}
uint64_t StreamingEngine::enqueue_request(uint64_t client_id, const std::string& req_proto)
{
return m_impl->enqueue_request(client_id, req_proto);
}
std::string StreamingEngine::await_responses()
{
return m_impl->await_responses();
}
std::optional<std::string> StreamingEngine::await_kv_events()
{
return m_impl->await_kv_events();
}
std::optional<std::string> StreamingEngine::await_iter_stats()
{
return m_impl->await_iter_stats();
}
void StreamingEngine::cancel_request(uint64_t request_id)
{
m_impl->cancel_request(request_id);
}
void StreamingEngine::shutdown()
{
m_impl->shutdown();
}
bool StreamingEngine::is_ready() const
{
return m_impl->is_ready();
}
bool StreamingEngine::has_completed() const
{
return m_impl->has_completed();
}
} // namespace nvidia::nvllm::trt
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "engine_trt/kv_event.hpp"
#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <xxhash.h>
#include <optional>
#include <string>
#include <vector>
using json = nlohmann::json;
namespace ex = tensorrt_llm::executor;
namespace tensorrt_llm::executor {
// Serialization for KVCacheRemovedData
void to_json(json& j, const KVCacheRemovedData& data)
{
j = json{{"block_hashes", data.blockHashes}};
}
void from_json(const json& j, KVCacheRemovedData& data)
{
j.at("block_hashes").get_to(data.blockHashes);
}
} // namespace tensorrt_llm::executor
namespace nvidia::nvllm::trt {
using IdType = ex::IdType;
using TokenIdType = ex::TokenIdType;
struct KVCacheStoredBlockData
{
KVCacheStoredBlockData() = default;
KVCacheStoredBlockData(const ex::KVCacheStoredBlockData& data)
{
std::vector<TokenIdType> tokens;
for (auto& token : data.tokens)
{
tokens.push_back(token.tokenId);
}
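// Hash the raw token ids with a fixed seed so that identical token sequences
// always produce the same tokens_hash.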
auto size = tokens.size() * sizeof(TokenIdType);
auto hash = XXH3_64bits_withSeed(tokens.data(), size, 1337);
this->block_hash = data.blockHash;
this->tokens_hash = hash;
this->lora_id = data.loraId;
}
/// @brief The hash of the block
IdType block_hash;
/// @brief A hash of the tokens in the block
IdType tokens_hash;
/// @brief The Lora ID of the block
IdType lora_id;
};
// Serialization for KVCacheStoredBlockData
void to_json(json& j, const KVCacheStoredBlockData& data)
{
j = json{
{"block_hash", data.block_hash},
{"tokens_hash", data.tokens_hash},
{"lora_id", data.lora_id},
};
}
void from_json(const json& j, KVCacheStoredBlockData& data)
{
j.at("block_hash").get_to(data.block_hash);
j.at("tokens_hash").get_to(data.tokens_hash);
j.at("lora_id").get_to(data.lora_id);
}
struct KVCacheStoredData
{
KVCacheStoredData() = default;
KVCacheStoredData(ex::KVCacheStoredData&& data) : parent_hash(std::move(data.parentHash))
{
for (auto& block : data.blocks)
{
blocks.emplace_back(block);
}
}
/// @brief The parent of this sequence of stored blocks
std::optional<IdType> parent_hash;
/// @brief A sequence of blocks. The parent of block `i` is block `i-1`
std::vector<KVCacheStoredBlockData> blocks;
};
using KVCacheRemovedData = ex::KVCacheRemovedData;
// Serialization for KVCacheStoredData
void to_json(json& j, const KVCacheStoredData& data)
{
j = json{{"blocks", data.blocks}};
if (data.parent_hash)
{
j["parent_hash"] = data.parent_hash.value();
}
}
void from_json(const json& j, KVCacheStoredData& data)
{
j.at("blocks").get_to(data.blocks);
if (j.contains("parent_hash"))
{
data.parent_hash = j.at("parent_hash").get<IdType>();
}
}
struct KVCacheEventData
{
KVCacheEventData() = default;
explicit KVCacheEventData(ex::KVCacheEventData&& data)
{
if (std::holds_alternative<ex::KVCacheStoredData>(data))
{
stored = KVCacheStoredData(std::move(std::get<ex::KVCacheStoredData>(data)));
}
else if (std::holds_alternative<ex::KVCacheRemovedData>(data))
{
removed = std::move(std::get<ex::KVCacheRemovedData>(data));
}
}
std::optional<KVCacheStoredData> stored;
std::optional<KVCacheRemovedData> removed;
};
// Serialization for KVCacheEventData
void to_json(json& j, const KVCacheEventData& data)
{
if (data.stored)
{
j["stored"] = data.stored.value();
}
else if (data.removed)
{
j["removed"] = data.removed.value();
}
}
void from_json(const json& j, KVCacheEventData& data)
{
if (j.contains("stored"))
{
data.stored = {j.at("stored").get<KVCacheStoredData>()};
}
else if (j.contains("removed"))
{
data.removed = {j.at("removed").get<KVCacheRemovedData>()};
}
}
struct KVCacheEvent
{
KVCacheEvent(IdType eventId, KVCacheEventData data);
KVCacheEvent(ex::KVCacheEvent&& event) : event_id(std::move(event.eventId)), data(std::move(event.data)) {}
/// @brief The unique id of this event
IdType event_id;
/// @brief The data corresponding to this event
KVCacheEventData data;
};
inline void to_json(json& j, const KVCacheEvent& event)
{
j = json{{"event_id", event.event_id}, {"data", event.data}};
}
inline void from_json(const json& j, KVCacheEvent& event)
{
j.at("event_id").get_to(event.event_id);
j.at("data").get_to(event.data);
}
struct KVCacheEvents
{
std::vector<KVCacheEvent> events;
bool shutdown;
};
inline void to_json(json& j, const KVCacheEvents& events)
{
j = json{{"events", events.events}, {"shutdown", events.shutdown}};
}
// inline void from_json(const json& j, KVCacheEvents& events)
// {
// j.at("events").get_to(events.events);
// j.at("shutdown").get_to(events.shutdown);
// }
std::string serialize_kv_events(std::deque<tensorrt_llm::executor::KVCacheEvent> events_in, bool shutdown)
{
std::vector<KVCacheEvent> events_out;
while (!events_in.empty())
{
auto event = events_in.front();
events_in.pop_front();
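// Forward only stored and removed events; created and updated events are dropped.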
if (std::holds_alternative<ex::KVCacheCreatedData>(event.data) ||
std::holds_alternative<ex::KVCacheUpdatedData>(event.data))
{
continue;
}
events_out.emplace_back(std::move(event));
}
KVCacheEvents events{std::move(events_out), shutdown};
return json(events).dump();
}
} // namespace nvidia::nvllm::trt
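
A consumer-side sketch, assuming only the JSON shape produced by the `to_json` overloads above: a top-level `shutdown` flag plus an `events` array of `{event_id, data}` objects, where `data` holds either `stored` or `removed`.

```
// Parses the string returned by serialize_kv_events() and walks its events.
#include <nlohmann/json.hpp>

#include <iostream>
#include <string>

void handle_kv_events(const std::string& payload)
{
    auto j = nlohmann::json::parse(payload);
    if (j.at("shutdown").get<bool>())
    {
        return;  // the engine is shutting down; stop polling
    }
    for (const auto& event : j.at("events"))
    {
        const auto& data = event.at("data");
        if (data.contains("stored"))
        {
            std::cout << "stored " << data["stored"]["blocks"].size() << " blocks\n";
        }
        else if (data.contains("removed"))
        {
            std::cout << "removed " << data["removed"]["block_hashes"].size() << " block hashes\n";
        }
    }
}
```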
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "tensorrt_llm/executor/executor.h"
namespace nvidia::nvllm::trt {
std::string serialize_kv_events(std::deque<tensorrt_llm::executor::KVCacheEvent> responses, bool shutdown);
} // namespace nvidia::nvllm::trt