Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
65a2dfab
Commit
65a2dfab
authored
Feb 20, 2025
by
Biswa Panda
Committed by
GitHub
Feb 20, 2025
Browse files
feat: add local model card (#216)
parent
7f85dcc3
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
951 additions
and
9 deletions
+951
-9
.github/workflows/copyright-check.ps1
.github/workflows/copyright-check.ps1
+1
-1
applications/llm/bin/tio/Cargo.lock
applications/llm/bin/tio/Cargo.lock
+2
-0
examples/rust/Cargo.lock
examples/rust/Cargo.lock
+3
-1
llm/rust/Cargo.lock
llm/rust/Cargo.lock
+5
-2
llm/rust/triton-llm/Cargo.toml
llm/rust/triton-llm/Cargo.toml
+11
-2
llm/rust/triton-llm/src/common.rs
llm/rust/triton-llm/src/common.rs
+16
-0
llm/rust/triton-llm/src/common/versioned.rs
llm/rust/triton-llm/src/common/versioned.rs
+21
-0
llm/rust/triton-llm/src/lib.rs
llm/rust/triton-llm/src/lib.rs
+2
-0
llm/rust/triton-llm/src/model_card.rs
llm/rust/triton-llm/src/model_card.rs
+17
-0
llm/rust/triton-llm/src/model_card/create.rs
llm/rust/triton-llm/src/model_card/create.rs
+172
-0
llm/rust/triton-llm/src/model_card/model.rs
llm/rust/triton-llm/src/model_card/model.rs
+288
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json
...data/sample-models/mock-llama-3.1-8b-instruct/config.json
+27
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/generation_config.json
...-models/mock-llama-3.1-8b-instruct/generation_config.json
+9
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
...a/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
+209
-0
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
...e-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
+94
-0
llm/rust/triton-llm/tests/model_card.rs
llm/rust/triton-llm/tests/model_card.rs
+70
-0
python-wheel/Cargo.lock
python-wheel/Cargo.lock
+2
-0
runtime/rust/src/lib.rs
runtime/rust/src/lib.rs
+1
-1
runtime/rust/src/slug.rs
runtime/rust/src/slug.rs
+0
-0
runtime/rust/src/transports/nats.rs
runtime/rust/src/transports/nats.rs
+1
-2
No files found.
.github/workflows/copyright-check.ps1
View file @
65a2dfab
...
@@ -122,7 +122,7 @@ $global:copyright_results = @{
...
@@ -122,7 +122,7 @@ $global:copyright_results = @{
$ignored_files
= @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
$ignored_files
= @('.clang-format', '.gitattributes', '.gitignore', '.gitkeep', '.patch', 'Cargo.lock', 'LICENSE', 'uv.lock', 'rust-toolchain.toml')
write-debug "
<
copyright-check
>
ignored_files
=
[
'$($ignored_files -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_files
=
[
'$($ignored_files -join "'
,
'")'
]
.
"
$ignored_paths
= @('.github', '.mypy_cache', '.pytest_cache')
$ignored_paths
= @('.github', '.mypy_cache', '.pytest_cache'
, 'llm/rust/triton-llm/tests/data/sample-models'
)
write-debug "
<
copyright-check
>
ignored_paths
=
[
'$($ignored_paths -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_paths
=
[
'$($ignored_paths -join "'
,
'")'
]
.
"
$ignored_types
= @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
$ignored_types
= @('.bat', '.gif', '.ico', '.ipynb', '.jpg', '.jpeg', '.patch', '.png', '.pyc', '.pyi', '.rst', '.zip', '.md')
write-debug "
<
copyright-check
>
ignored_types
=
[
'$($ignored_types -join "'
,
'")'
]
.
"
write-debug "
<
copyright-check
>
ignored_types
=
[
'$($ignored_types -join "'
,
'")'
]
.
"
...
...
applications/llm/bin/tio/Cargo.lock
View file @
65a2dfab
...
@@ -687,6 +687,7 @@ dependencies = [
...
@@ -687,6 +687,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets 0.52.6",
"windows-targets 0.52.6",
]
]
...
@@ -5125,6 +5126,7 @@ dependencies = [
...
@@ -5125,6 +5126,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
examples/rust/Cargo.lock
View file @
65a2dfab
# This file is automatically @generated by Cargo.
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
# It is not intended for manual editing.
version =
3
version =
4
[[package]]
[[package]]
name = "addr2line"
name = "addr2line"
...
@@ -464,6 +464,7 @@ dependencies = [
...
@@ -464,6 +464,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets",
"windows-targets",
]
]
...
@@ -3104,6 +3105,7 @@ dependencies = [
...
@@ -3104,6 +3105,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
llm/rust/Cargo.lock
View file @
65a2dfab
...
@@ -712,6 +712,7 @@ dependencies = [
...
@@ -712,6 +712,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets 0.52.6",
"windows-targets 0.52.6",
]
]
...
@@ -4713,9 +4714,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
...
@@ -4713,9 +4714,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
[[package]]
name = "tempfile"
name = "tempfile"
version = "3.1
6.0
"
version = "3.1
7.1
"
source = "registry+https://github.com/rust-lang/crates.io-index"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "
38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91
"
checksum = "
22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230
"
dependencies = [
dependencies = [
"cfg-if 1.0.0",
"cfg-if 1.0.0",
"fastrand",
"fastrand",
...
@@ -5283,6 +5284,7 @@ dependencies = [
...
@@ -5283,6 +5284,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
@@ -5298,6 +5300,7 @@ dependencies = [
...
@@ -5298,6 +5300,7 @@ dependencies = [
"rstest",
"rstest",
"serde",
"serde",
"serde_json",
"serde_json",
"tempfile",
"thiserror 2.0.11",
"thiserror 2.0.11",
"tokio",
"tokio",
"tokio-stream",
"tokio-stream",
...
...
llm/rust/triton-llm/Cargo.toml
View file @
65a2dfab
...
@@ -48,8 +48,16 @@ validator = { workspace = true }
...
@@ -48,8 +48,16 @@ validator = { workspace = true }
uuid
=
{
workspace
=
true
}
uuid
=
{
workspace
=
true
}
xxhash-rust
=
{
workspace
=
true
}
xxhash-rust
=
{
workspace
=
true
}
blake3
=
"1"
# protocols
# protocols
chrono
=
{
version
=
"0.4"
}
chrono
=
{
version
=
"0.4"
,
default-features
=
false
,
features
=
[
"alloc"
,
"std"
,
"clock"
,
"now"
,
"serde"
,
]
}
serde_json
=
{
version
=
"1"
}
serde_json
=
{
version
=
"1"
}
regex
=
"1"
regex
=
"1"
unicode-segmentation
=
"1.12"
unicode-segmentation
=
"1.12"
...
@@ -67,4 +75,5 @@ mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "5e6
...
@@ -67,4 +75,5 @@ mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git", rev = "5e6
insta
=
{
version
=
"1.41"
,
features
=
[
"glob"
,
"json"
,
"redactions"
]}
insta
=
{
version
=
"1.41"
,
features
=
[
"glob"
,
"json"
,
"redactions"
]}
proptest
=
"1.5.0"
proptest
=
"1.5.0"
reqwest
=
{
version
=
"0.12"
,
default-features
=
false
,
features
=
[
"json"
,
"stream"
,
"rustls-tls"
]
}
reqwest
=
{
version
=
"0.12"
,
default-features
=
false
,
features
=
[
"json"
,
"stream"
,
"rustls-tls"
]
}
rstest
=
"0.18.2"
rstest
=
"0.18.2"
\ No newline at end of file
tempfile
=
"3.17.1"
\ No newline at end of file
llm/rust/triton-llm/src/common.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub
mod
versioned
;
llm/rust/triton-llm/src/common/versioned.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// A trait allowing to get/set a revision on an object.
/// NATS uses this to ensure atomic updates.
pub
trait
Versioned
{
fn
revision
(
&
self
)
->
u64
;
fn
set_revision
(
&
mut
self
,
r
:
u64
);
}
llm/rust/triton-llm/src/lib.rs
View file @
65a2dfab
...
@@ -23,3 +23,5 @@ pub mod http;
...
@@ -23,3 +23,5 @@ pub mod http;
pub
mod
kv_router
;
pub
mod
kv_router
;
pub
mod
protocols
;
pub
mod
protocols
;
pub
mod
types
;
pub
mod
types
;
pub
mod
model_card
;
pub
mod
common
;
\ No newline at end of file
llm/rust/triton-llm/src/model_card.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub
mod
create
;
pub
mod
model
;
\ No newline at end of file
llm/rust/triton-llm/src/model_card/create.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
std
::
collections
::
HashMap
;
use
std
::
path
::
Path
;
use
std
::
fs
;
use
crate
::
model_card
::
model
::
ModelDeploymentCard
;
use
anyhow
::{
Context
,
Result
};
use
crate
::
model_card
::
model
::{
ModelInfoType
,
TokenizerKind
,
PromptFormatterArtifact
,
File
};
impl
ModelDeploymentCard
{
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
pub
async
fn
from_local_path
(
local_root_dir
:
impl
AsRef
<
Path
>
)
->
anyhow
::
Result
<
Self
>
{
let
local_root_dir
=
local_root_dir
.as_ref
();
check_valid_local_repo_path
(
local_root_dir
)
?
;
let
repo_id
=
local_root_dir
.canonicalize
()
?
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Path contains invalid Unicode"
))
?
.to_string
();
let
model_name
=
local_root_dir
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid model directory name"
))
?
;
Self
::
from_repo
(
&
repo_id
,
model_name
)
.await
}
/// TODO: This will be implemented after nova-hub is integrated with the model-card
/// TODO: Attempt to auto-detect model type and construct an MDC from a NGC repo
pub
async
fn
from_ngc_repo
(
_
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Err
(
anyhow
::
anyhow!
(
"ModelDeploymentCard::from_ngc_repo is not implemented"
))
}
pub
async
fn
from_repo
(
repo_id
:
&
str
,
model_name
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
{
display_name
:
model_name
.to_string
(),
service_name
:
model_name
.to_string
(),
model_info
:
ModelInfoType
::
from_repo
(
repo_id
)
.await
?
,
tokenizer
:
TokenizerKind
::
from_repo
(
repo_id
)
.await
?
,
prompt_formatter
:
PromptFormatterArtifact
::
from_repo
(
repo_id
)
.await
?
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
revision
:
0
,
last_published
:
None
,
requires_preprocessing
:
true
,
})
}
}
impl
ModelInfoType
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Self
>
{
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract model info from repo {}"
,
repo_id
))
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfConfigJson
(
check_for_file
(
repo
,
"config.json"
)
.await
?
,
))
}
}
impl
PromptFormatterArtifact
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Option
<
Self
>>
{
// we should only error if we expect a prompt formatter and it's not found
// right now, we don't know when to expect it, so we just return Ok(Some/None)
Ok
(
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract prompt format from repo {}"
,
repo_id
))
.ok
())
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfTokenizerConfigJson
(
check_for_file
(
repo
,
"tokenizer_config.json"
)
.await
?
,
))
}
}
impl
TokenizerKind
{
pub
async
fn
from_repo
(
repo_id
:
&
str
)
->
Result
<
Self
>
{
Self
::
try_is_hf_repo
(
repo_id
)
.await
.with_context
(||
format!
(
"unable to extract tokenizer kind from repo {}"
,
repo_id
))
}
async
fn
try_is_hf_repo
(
repo
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Ok
(
Self
::
HfTokenizerJson
(
check_for_file
(
repo
,
"tokenizer.json"
)
.await
?
,
))
}
}
/// Checks if the provided path contains the expected file.
async
fn
check_for_file
(
repo_id
:
&
str
,
file
:
&
str
)
->
anyhow
::
Result
<
File
>
{
let
mut
files
=
check_for_files
(
repo_id
,
vec!
[
file
.to_string
()])
.await
?
;
let
file
=
files
.remove
(
file
)
.ok_or
(
anyhow
::
anyhow!
(
"file {} not found"
,
file
))
?
;
Ok
(
file
)
}
async
fn
check_for_files
(
repo_id
:
&
str
,
files
:
Vec
<
String
>
)
->
Result
<
HashMap
<
String
,
File
>>
{
let
dir_entries
=
fs
::
read_dir
(
repo_id
)
.with_context
(||
format!
(
"Failed to read directory: {}"
,
repo_id
))
?
;
let
mut
found_files
=
HashMap
::
new
();
for
entry
in
dir_entries
{
let
entry
=
entry
.with_context
(||
format!
(
"Failed to read directory entry in {}"
,
repo_id
))
?
;
let
path
=
entry
.path
();
let
file_name
=
path
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid file name in {}"
,
repo_id
))
?
;
if
files
.contains
(
&
file_name
.to_string
())
{
found_files
.insert
(
file_name
.to_string
(),
path
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid path"
))
?
.to_string
(),
);
}
}
Ok
(
found_files
)
}
/// Checks if the provided path is a valid local repository path.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist or isn't a directory
fn
check_valid_local_repo_path
(
path
:
impl
AsRef
<
Path
>
)
->
Result
<
()
>
{
let
path
=
path
.as_ref
();
if
!
path
.exists
()
{
return
Err
(
anyhow
::
anyhow!
(
"Model path does not exist: {}"
,
path
.display
()));
}
if
!
path
.is_dir
()
{
return
Err
(
anyhow
::
anyhow!
(
"Model path is not a directory: {}"
,
path
.display
()));
}
Ok
(())
}
\ No newline at end of file
llm/rust/triton-llm/src/model_card/model.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and service name for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use
anyhow
::
Result
;
use
either
::
Either
;
use
crate
::
protocols
::
TokenIdType
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
sync
::
Arc
;
use
std
::
fmt
;
use
std
::
path
::
Path
;
use
std
::
time
::
Duration
;
use
derive_builder
::
Builder
;
use
triton_distributed
::
slug
::
Slug
;
pub
const
BUCKET_NAME
:
&
str
=
"mdc"
;
/// Delete model deployment cards that haven't been re-published after this long.
/// Cleans up if the worker stopped.
pub
const
BUCKET_TTL
:
Duration
=
Duration
::
from_secs
(
5
*
60
);
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone
const
CARD_MAX_AGE
:
chrono
::
TimeDelta
=
chrono
::
TimeDelta
::
minutes
(
5
);
pub
type
File
=
String
;
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
ModelInfoType
{
HfConfigJson
(
File
),
}
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
TokenizerKind
{
HfTokenizerJson
(
File
),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize,
Deserialize,
Clone,
Debug)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
PromptFormatterArtifact
{
HfTokenizerConfigJson
(
File
),
}
#[derive(Serialize,
Deserialize,
Clone,
Debug,
PartialEq,
Eq,
Hash)]
#[serde(rename_all
=
"snake_case"
)]
pub
enum
PromptContextMixin
{
/// Support OAI Chat Messages and Tools
OaiChat
,
/// Enables templates with `{{datatime}}` to be rendered with the current date and time.
Llama3DateTime
,
}
#[derive(Serialize,
Deserialize,
Clone,
Debug,
Builder)]
pub
struct
ModelDeploymentCard
{
/// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
pub
display_name
:
String
,
/// Identifier to expect in OpenAI compatible HTTP request, e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct"
/// This will get slugified for use in NATS.
pub
service_name
:
String
,
/// Model information
pub
model_info
:
ModelInfoType
,
/// Tokenizer configuration
pub
tokenizer
:
TokenizerKind
,
/// Prompt Formatter configuration
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
prompt_formatter
:
Option
<
PromptFormatterArtifact
>
,
/// Prompt Formatter Config
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
prompt_context
:
Option
<
Vec
<
PromptContextMixin
>>
,
/// When this card was last advertised by a worker. None if not yet published.
pub
last_published
:
Option
<
chrono
::
DateTime
<
chrono
::
Utc
>>
,
/// Incrementing count of how many times we published this card
#[serde(default,
skip_serializing)]
pub
revision
:
u64
,
/// Does this model expect preprocessing (tokenization, etc) to be already done?
/// If this is true they get a BackendInput JSON. If this is false they get
/// a ChatCompletionRequest JSON.
#[serde(default)]
pub
requires_preprocessing
:
bool
,
}
impl
ModelDeploymentCard
{
pub
fn
builder
()
->
ModelDeploymentCardBuilder
{
ModelDeploymentCardBuilder
::
default
()
}
/// A URL and NATS friendly and very likely unique ID for this model.
/// Mostly human readable. a-z, 0-9, _ and - only.
/// Pass the service_name.
pub
fn
service_name_slug
(
s
:
&
str
)
->
Slug
{
Slug
::
from_string
(
s
)
}
pub
fn
set_service_name
(
&
mut
self
,
service_name
:
&
str
)
{
self
.service_name
=
service_name
.to_string
();
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub
fn
expiry_check_period
()
->
Duration
{
match
CARD_MAX_AGE
.to_std
()
{
Ok
(
duration
)
=>
duration
/
3
,
Err
(
_
)
=>
{
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!
(
"Cannot run card expiry watcher, invalid CARD_MAX_AGE"
);
}
}
}
pub
fn
slug
(
&
self
)
->
Slug
{
ModelDeploymentCard
::
service_name_slug
(
&
self
.service_name
)
}
/// Load a model deployment card from a JSON file
pub
fn
load_from_json_file
<
P
:
AsRef
<
Path
>>
(
file
:
P
)
->
std
::
io
::
Result
<
Self
>
{
let
mut
card
:
ModelDeploymentCard
=
serde_json
::
from_str
(
&
std
::
fs
::
read_to_string
(
file
)
?
)
?
;
card
.requires_preprocessing
=
false
;
Ok
(
card
)
}
/// Load a model deployment card from a JSON string
pub
fn
load_from_json_str
(
json
:
&
str
)
->
Result
<
Self
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
from_str
(
json
)
?
)
}
/// Save the model deployment card to a JSON file
pub
fn
save_to_json_file
(
&
self
,
file
:
&
str
)
->
Result
<
(),
anyhow
::
Error
>
{
std
::
fs
::
write
(
file
,
self
.to_json
()
?
)
?
;
Ok
(())
}
/// Serialize the model deployment card to a JSON string
pub
fn
to_json
(
&
self
)
->
Result
<
String
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
to_string
(
self
)
?
)
}
pub
fn
mdcsum
(
&
self
)
->
String
{
let
json
=
self
.to_json
()
.unwrap
();
format!
(
"{}"
,
blake3
::
hash
(
json
.as_bytes
()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub
fn
is_expired
(
&
self
)
->
bool
{
if
let
Some
(
last_published
)
=
self
.last_published
.as_ref
()
{
chrono
::
Utc
::
now
()
-
last_published
>
CARD_MAX_AGE
}
else
{
false
}
}
}
impl
fmt
::
Display
for
ModelDeploymentCard
{
fn
fmt
(
&
self
,
f
:
&
mut
fmt
::
Formatter
<
'_
>
)
->
fmt
::
Result
{
write!
(
f
,
"{}"
,
self
.slug
())
}
}
pub
trait
ModelInfo
:
Send
+
Sync
{
/// Model type
fn
model_type
(
&
self
)
->
String
;
/// Token ID for the beginning of sequence
fn
bos_token_id
(
&
self
)
->
TokenIdType
;
/// Token ID for the end of sequence
fn
eos_token_ids
(
&
self
)
->
Vec
<
TokenIdType
>
;
/// Maximum position embeddings / max sequence length
fn
max_position_embeddings
(
&
self
)
->
usize
;
/// Vocabulary size
fn
vocab_size
(
&
self
)
->
usize
;
}
impl
ModelInfoType
{
pub
async
fn
get_model_info
(
&
self
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
match
self
{
Self
::
HfConfigJson
(
info
)
=>
HFConfigJsonFile
::
from_file
(
info
)
.await
,
}
}
}
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
struct
HFConfigJsonFile
{
bos_token_id
:
TokenIdType
,
#[serde(with
=
"either::serde_untagged"
)]
eos_token_id
:
Either
<
TokenIdType
,
Vec
<
TokenIdType
>>
,
/// denotes the mixin to the flattened data model which can be present
/// in the config.json file
architectures
:
Vec
<
String
>
,
/// general model type
model_type
:
String
,
/// max sequence length
max_position_embeddings
:
usize
,
/// number of layers in the model
num_hidden_layers
:
usize
,
/// number of attention heads in the model
num_attention_heads
:
usize
,
/// Vocabulary size
vocab_size
:
usize
,
}
impl
HFConfigJsonFile
{
async
fn
from_file
(
file
:
&
File
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
let
contents
=
std
::
fs
::
read_to_string
(
&
file
)
?
;
let
config
:
Self
=
serde_json
::
from_str
(
&
contents
)
?
;
Ok
(
Arc
::
new
(
config
))
}
}
impl
ModelInfo
for
HFConfigJsonFile
{
fn
model_type
(
&
self
)
->
String
{
self
.model_type
.clone
()
}
fn
bos_token_id
(
&
self
)
->
TokenIdType
{
self
.bos_token_id
}
fn
eos_token_ids
(
&
self
)
->
Vec
<
TokenIdType
>
{
match
&
self
.eos_token_id
{
Either
::
Left
(
eos_token_id
)
=>
vec!
[
*
eos_token_id
],
Either
::
Right
(
eos_token_ids
)
=>
eos_token_ids
.clone
(),
}
}
fn
max_position_embeddings
(
&
self
)
->
usize
{
self
.max_position_embeddings
}
fn
vocab_size
(
&
self
)
->
usize
{
self
.vocab_size
}
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json
0 → 100644
View file @
65a2dfab
{
"architectures"
:
[
"LlamaForCausalLM"
],
"attention_bias"
:
false
,
"attention_dropout"
:
0.0
,
"bos_token_id"
:
128000
,
"eos_token_id"
:
128009
,
"hidden_act"
:
"silu"
,
"hidden_size"
:
4096
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
14336
,
"max_position_embeddings"
:
8192
,
"model_type"
:
"llama"
,
"num_attention_heads"
:
32
,
"num_hidden_layers"
:
32
,
"num_key_value_heads"
:
8
,
"pretraining_tp"
:
1
,
"rms_norm_eps"
:
1e-05
,
"rope_scaling"
:
null
,
"rope_theta"
:
500000.0
,
"tie_word_embeddings"
:
false
,
"torch_dtype"
:
"bfloat16"
,
"transformers_version"
:
"4.40.0.dev0"
,
"use_cache"
:
true
,
"vocab_size"
:
128256
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/generation_config.json
0 → 100644
View file @
65a2dfab
{
"bos_token_id"
:
128000
,
"eos_token_id"
:
[
128001
,
128009
],
"do_sample"
:
true
,
"temperature"
:
0.6
,
"max_length"
:
4096
,
"top_p"
:
0.9
,
"transformers_version"
:
"4.40.0.dev0"
}
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer.json
0 → 100644
View file @
65a2dfab
{
"version"
:
"1.0"
,
"truncation"
:
null
,
"padding"
:
null
,
"added_tokens"
:
[
{
"id"
:
128000
,
"content"
:
"<|begin_of_text|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128001
,
"content"
:
"<|end_of_text|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128002
,
"content"
:
"<|reserved_special_token_0|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128003
,
"content"
:
"<|reserved_special_token_1|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128004
,
"content"
:
"<|reserved_special_token_2|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128005
,
"content"
:
"<|reserved_special_token_3|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128006
,
"content"
:
"<|start_header_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128007
,
"content"
:
"<|end_header_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128008
,
"content"
:
"<|reserved_special_token_4|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128009
,
"content"
:
"<|eot_id|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
},
{
"id"
:
128010
,
"content"
:
"<|reserved_special_token_5|>"
,
"single_word"
:
false
,
"lstrip"
:
false
,
"rstrip"
:
false
,
"normalized"
:
false
,
"special"
:
true
}
],
"normalizer"
:
null
,
"pre_tokenizer"
:
{
"type"
:
"Sequence"
,
"pretokenizers"
:
[
{
"type"
:
"Split"
,
"pattern"
:
{
"Regex"
:
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^
\\
r
\\
n
\\
p{L}
\\
p{N}]?
\\
p{L}+|
\\
p{N}{1,3}| ?[^
\\
s
\\
p{L}
\\
p{N}]+[
\\
r
\\
n]*|
\\
s*[
\\
r
\\
n]+|
\\
s+(?!
\\
S)|
\\
s+"
},
"behavior"
:
"Isolated"
,
"invert"
:
false
},
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
false
,
"trim_offsets"
:
true
,
"use_regex"
:
false
}
]
},
"post_processor"
:
{
"type"
:
"Sequence"
,
"processors"
:
[
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
true
,
"trim_offsets"
:
false
,
"use_regex"
:
true
},
{
"type"
:
"TemplateProcessing"
,
"single"
:
[
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
0
}
},
{
"Sequence"
:
{
"id"
:
"A"
,
"type_id"
:
0
}
}
],
"pair"
:
[
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
0
}
},
{
"Sequence"
:
{
"id"
:
"A"
,
"type_id"
:
0
}
},
{
"SpecialToken"
:
{
"id"
:
"<|begin_of_text|>"
,
"type_id"
:
1
}
},
{
"Sequence"
:
{
"id"
:
"B"
,
"type_id"
:
1
}
}
],
"special_tokens"
:
{
"<|begin_of_text|>"
:
{
"id"
:
"<|begin_of_text|>"
,
"ids"
:
[
128000
],
"tokens"
:
[
"<|begin_of_text|>"
]
}
}
}
]
},
"decoder"
:
{
"type"
:
"ByteLevel"
,
"add_prefix_space"
:
true
,
"trim_offsets"
:
true
,
"use_regex"
:
true
},
"model"
:
{
"type"
:
"BPE"
,
"dropout"
:
null
,
"unk_token"
:
null
,
"continuing_subword_prefix"
:
null
,
"end_of_word_suffix"
:
null
,
"fuse_unk"
:
false
,
"byte_fallback"
:
false
,
"ignore_merges"
:
true
,
"vocab"
:
{},
"merges"
:
[]
}
}
\ No newline at end of file
llm/rust/triton-llm/tests/data/sample-models/mock-llama-3.1-8b-instruct/tokenizer_config.json
0 → 100644
View file @
65a2dfab
{
"added_tokens_decoder"
:
{
"128000"
:
{
"content"
:
"<|begin_of_text|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128001"
:
{
"content"
:
"<|end_of_text|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128002"
:
{
"content"
:
"<|reserved_special_token_0|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128003"
:
{
"content"
:
"<|reserved_special_token_1|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128004"
:
{
"content"
:
"<|reserved_special_token_2|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128005"
:
{
"content"
:
"<|reserved_special_token_3|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128006"
:
{
"content"
:
"<|start_header_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128007"
:
{
"content"
:
"<|end_header_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128008"
:
{
"content"
:
"<|reserved_special_token_4|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
},
"128009"
:
{
"content"
:
"<|eot_id|>"
,
"lstrip"
:
false
,
"normalized"
:
false
,
"rstrip"
:
false
,
"single_word"
:
false
,
"special"
:
true
}
},
"bos_token"
:
"<|begin_of_text|>"
,
"chat_template"
:
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
\n\n
'+ message['content'] | trim %}{% if loop.first %}{% set content = bos_token + content %}{% endif %}{% if not loop.last %}{% set content = content + '<|eot_id|>'%}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\n\n
' }}{% endif %}"
,
"clean_up_tokenization_spaces"
:
true
,
"eos_token"
:
"<|eot_id|>"
,
"model_input_names"
:
[
"input_ids"
,
"attention_mask"
],
"model_max_length"
:
1000000000000000019884624838656
,
"tokenizer_class"
:
"PreTrainedTokenizerFast"
}
llm/rust/triton-llm/tests/model_card.rs
0 → 100644
View file @
65a2dfab
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
triton_llm
::
model_card
::
model
::{
ModelDeploymentCard
,
ModelInfoType
,
TokenizerKind
,
PromptFormatterArtifact
};
use
tempfile
::
tempdir
;
#[tokio::test]
async fn test_model_info_from_hf_like_local_repo() {
    // Build a deployment card from the checked-in mock HF-style repo and
    // verify the fields parsed from its config.json are exposed via ModelInfo.
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();
    let model_info = card.model_info.get_model_info().await.unwrap();

    assert_eq!(model_info.model_type(), "llama");
    assert_eq!(model_info.bos_token_id(), 128000);
    assert_eq!(model_info.eos_token_ids(), vec![128009]);
    assert_eq!(model_info.max_position_embeddings(), 8192);
    assert_eq!(model_info.vocab_size(), 128256);
}
#[tokio::test]
async fn test_model_info_from_non_existent_local_repo() {
    // A path that does not exist must surface as an Err, not a panic.
    let missing_dir = "tests/data/sample-models/this-model-does-not-exist";
    let outcome = ModelDeploymentCard::from_local_path(missing_dir).await;
    assert!(outcome.is_err());
}
#[tokio::test]
async fn test_tokenizer_from_hf_like_local_repo() {
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();

    // The mock repo ships a tokenizer.json, so the HF JSON tokenizer
    // variant is the one we expect the card to pick up.
    assert!(
        matches!(card.tokenizer, TokenizerKind::HfTokenizerJson(_)),
        "Expected HfTokenizerJson"
    );
}
#[tokio::test]
async fn test_prompt_formatter_from_hf_like_local_repo() {
    let repo_dir = "tests/data/sample-models/mock-llama-3.1-8b-instruct";
    let card = ModelDeploymentCard::from_local_path(repo_dir).await.unwrap();

    // tokenizer_config.json in the mock repo carries the chat template,
    // so the card should resolve a HfTokenizerConfigJson prompt formatter.
    assert!(
        matches!(
            card.prompt_formatter,
            Some(PromptFormatterArtifact::HfTokenizerConfigJson(_))
        ),
        "Expected HfTokenizerConfigJson prompt formatter"
    );
}
#[tokio::test]
async fn test_missing_required_files() {
    // An empty temporary directory has no config.json, so building the
    // card must fail and the error should mention the extraction failure.
    let empty_dir = tempdir().unwrap();
    let outcome = ModelDeploymentCard::from_local_path(empty_dir.path()).await;
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("unable to extract"));
}
\ No newline at end of file
python-wheel/Cargo.lock
View file @
65a2dfab
...
@@ -469,6 +469,7 @@ dependencies = [
...
@@ -469,6 +469,7 @@ dependencies = [
"iana-time-zone",
"iana-time-zone",
"js-sys",
"js-sys",
"num-traits",
"num-traits",
"serde",
"wasm-bindgen",
"wasm-bindgen",
"windows-targets",
"windows-targets",
]
]
...
@@ -3189,6 +3190,7 @@ dependencies = [
...
@@ -3189,6 +3190,7 @@ dependencies = [
"async-stream",
"async-stream",
"async-trait",
"async-trait",
"axum 0.8.1",
"axum 0.8.1",
"blake3",
"bytes",
"bytes",
"chrono",
"chrono",
"derive_builder",
"derive_builder",
...
...
runtime/rust/src/lib.rs
View file @
65a2dfab
...
@@ -38,11 +38,11 @@ pub mod protocols;
...
@@ -38,11 +38,11 @@ pub mod protocols;
pub
mod
runnable
;
pub
mod
runnable
;
pub
mod
runtime
;
pub
mod
runtime
;
pub
mod
service
;
pub
mod
service
;
pub
mod
slug
;
pub
mod
transports
;
pub
mod
transports
;
pub
mod
worker
;
pub
mod
worker
;
pub
mod
distributed
;
pub
mod
distributed
;
pub
use
futures
::
stream
;
pub
use
futures
::
stream
;
pub
use
tokio_util
::
sync
::
CancellationToken
;
pub
use
tokio_util
::
sync
::
CancellationToken
;
pub
use
worker
::
Worker
;
pub
use
worker
::
Worker
;
...
...
runtime/rust/src/
transports/nats/
slug.rs
→
runtime/rust/src/slug.rs
View file @
65a2dfab
File moved
runtime/rust/src/transports/nats.rs
View file @
65a2dfab
...
@@ -38,8 +38,7 @@ use std::path::PathBuf;
...
@@ -38,8 +38,7 @@ use std::path::PathBuf;
use
tokio
::
time
;
use
tokio
::
time
;
use
validator
::{
Validate
,
ValidationError
};
use
validator
::{
Validate
,
ValidationError
};
mod
slug
;
pub
use
crate
::
slug
::
Slug
;
pub
use
slug
::
Slug
;
use
tracing
as
log
;
use
tracing
as
log
;
#[derive(Clone)]
#[derive(Clone)]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment