Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
1954fcfa
Unverified
Commit
1954fcfa
authored
Aug 07, 2025
by
Graham King
Committed by
GitHub
Aug 07, 2025
Browse files
chore: Remove service_name from ModelDeploymentCard (#2349)
parent
ccc8815b
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
892 additions
and
1001 deletions
+892
-1001
lib/bindings/python/rust/llm/model_card.rs
lib/bindings/python/rust/llm/model_card.rs
+1
-13
lib/llm/src/backend.rs
lib/llm/src/backend.rs
+1
-13
lib/llm/src/local_model.rs
lib/llm/src/local_model.rs
+4
-1
lib/llm/src/migration.rs
lib/llm/src/migration.rs
+1
-13
lib/llm/src/model_card.rs
lib/llm/src/model_card.rs
+879
-3
lib/llm/src/model_card/create.rs
lib/llm/src/model_card/create.rs
+0
-247
lib/llm/src/model_card/model.rs
lib/llm/src/model_card/model.rs
+0
-657
lib/llm/src/preprocessor.rs
lib/llm/src/preprocessor.rs
+1
-1
lib/llm/src/preprocessor/prompt/template.rs
lib/llm/src/preprocessor/prompt/template.rs
+1
-13
lib/llm/tests/backend.rs
lib/llm/tests/backend.rs
+1
-13
lib/llm/tests/model_card.rs
lib/llm/tests/model_card.rs
+1
-13
lib/llm/tests/preprocessor.rs
lib/llm/tests/preprocessor.rs
+1
-13
lib/runtime/src/slug.rs
lib/runtime/src/slug.rs
+1
-1
No files found.
lib/bindings/python/rust/llm/model_card.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
super
::
*
;
use
llm_rs
::
model_card
::
model
::
ModelDeploymentCard
as
RsModelDeploymentCard
;
use
llm_rs
::
model_card
::
ModelDeploymentCard
as
RsModelDeploymentCard
;
#[pyclass]
#[derive(Clone)]
...
...
lib/llm/src/backend.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Backend
//!
...
...
@@ -33,7 +21,7 @@ use anyhow::{Error, Result};
use
futures
::
stream
::{
self
,
StreamExt
};
use
tracing
as
log
;
use
crate
::
model_card
::
model
::
{
ModelDeploymentCard
,
TokenizerKind
};
use
crate
::
model_card
::{
ModelDeploymentCard
,
TokenizerKind
};
use
dynamo_runtime
::{
pipeline
::{
async_trait
,
AsyncEngineContextProvider
,
ManyOut
,
Operator
,
ResponseStream
,
...
...
lib/llm/src/local_model.rs
View file @
1954fcfa
...
...
@@ -251,12 +251,15 @@ impl LocalModel {
&
self
.full_path
}
/// Human friendly model name, exactly as stored on the deployment card.
/// This is the correct name.
pub fn display_name(&self) -> &str {
    self.card.display_name.as_str()
}
/// The name under which we make this model available over HTTP.
/// A slugified version of the model's name, for use in NATS, etcd, etc.
pub
fn
service_name
(
&
self
)
->
&
str
{
&
self
.card.s
ervice_name
self
.card
.s
lug
()
.as_ref
()
}
pub
fn
request_template
(
&
self
)
->
Option
<
RequestTemplate
>
{
...
...
lib/llm/src/migration.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
std
::
sync
::
Arc
;
...
...
@@ -23,7 +11,7 @@ use async_nats::client::{
};
use
crate
::{
model_card
::
model
::
ModelDeploymentCard
,
model_card
::
ModelDeploymentCard
,
protocols
::
common
::
llm_backend
::{
LLMEngineOutput
,
PreprocessedRequest
},
};
...
...
lib/llm/src/model_card.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub
mod
create
;
pub
mod
model
;
pub
use
model
::
ModelDeploymentCard
;
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and slug (the slugified display name) for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use
std
::
fmt
;
use
std
::
fs
::
File
;
use
std
::
path
::{
Path
,
PathBuf
};
use
std
::
sync
::
Arc
;
use
std
::
time
::
Duration
;
use
anyhow
::{
Context
,
Result
};
use
derive_builder
::
Builder
;
use
dynamo_runtime
::{
slug
::
Slug
,
storage
::
key_value_store
::
Versioned
,
transports
::
nats
};
use
serde
::{
Deserialize
,
Serialize
};
use
tokenizers
::
Tokenizer
as
HfTokenizer
;
use
url
::
Url
;
use
crate
::
gguf
::{
Content
,
ContentConfig
,
ModelConfigLike
};
use
crate
::
protocols
::
TokenIdType
;
/// Identify model deployment cards in the key-value store
pub const ROOT_PATH: &str = "mdc";

/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone.
///
/// `ModelDeploymentCard::is_expired` compares the age of `last_published` against this value,
/// and `ModelDeploymentCard::expiry_check_period` returns one third of it.
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
/// Where a model's configuration is read from. See `ModelInfoType::get_model_info`.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
    /// Location of a HuggingFace `config.json`, kept as a string. `move_to_nats` rewrites it
    /// to a `nats://` URL and `move_from_nats` rewrites it to a local file path.
    HfConfigJson(String),
    /// Path of a GGUF file whose metadata supplies the configuration.
    GGUF(PathBuf),
}
/// Where the tokenizer of a model comes from. See `ModelDeploymentCard::tokenizer_hf`.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
    /// Location of a HuggingFace `tokenizer.json`; loaded with `HfTokenizer::from_file`.
    HfTokenizerJson(String),
    /// An in-memory tokenizer, as produced by `TokenizerKind::from_gguf`.
    GGUF(Box<HfTokenizer>),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact {
    /// Location of a HuggingFace `tokenizer_config.json`.
    HfTokenizerConfigJson(String),
    /// Location of a standalone `chat_template.jinja` file.
    HfChatTemplate(String),
    /// Path of a GGUF file; `ModelDeploymentCard::from_gguf` stores the model file's own path here.
    GGUF(PathBuf),
}
/// Optional extras listed in `ModelDeploymentCard::prompt_context`.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum PromptContextMixin {
    /// Support OAI Chat Messages and Tools
    OaiChat,
    /// Enables templates with `{{datetime}}` to be rendered with the current date and time.
    Llama3DateTime,
}
/// Where the default generation (sampling) parameters of a model are read from.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum GenerationConfig {
    /// Location of a HuggingFace `generation_config.json`.
    HfGenerationConfigJson(String),
    /// Path of a GGUF file.
    // NOTE(review): nothing in this file constructs this variant; `from_gguf` sets
    // `gen_config` to `None`. Confirm whether it is used elsewhere.
    GGUF(PathBuf),
}
/// The ModelDeploymentCard (MDC): model deployment configuration that can be serialized to
/// and from JSON, and whose file-backed fields can be moved to and from the NATS object store.
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
    /// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
    pub display_name: String,

    // Cache the Slugified display_name so we can share references to it
    // Kept in sync with `display_name` by `with_name_only`, `set_name`, `from_gguf` and
    // `from_repo`; read through `slug()`.
    slug: Slug,

    /// Model information
    pub model_info: Option<ModelInfoType>,

    /// Tokenizer configuration
    pub tokenizer: Option<TokenizerKind>,

    /// Prompt Formatter configuration
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_formatter: Option<PromptFormatterArtifact>,

    /// chat template may be stored as a separate file instead of in `prompt_formatter`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub chat_template_file: Option<PromptFormatterArtifact>,

    /// Generation config - default sampling params
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gen_config: Option<GenerationConfig>,

    /// Prompt Formatter Config
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_context: Option<Vec<PromptContextMixin>>,

    /// When this card was last advertised by a worker. None if not yet published.
    /// Set to the current UTC time by `Versioned::set_revision`.
    pub last_published: Option<chrono::DateTime<chrono::Utc>>,

    /// Incrementing count of how many times we published this card
    /// Never serialized (`skip_serializing`); defaults to 0 when absent on deserialization.
    #[serde(default, skip_serializing)]
    pub revision: u64,

    /// Max context (in number of tokens) this model can handle
    /// The loaders in this file store 0 when the value cannot be determined.
    pub context_length: u32,

    /// Size of a KV cache block - vllm only currently
    /// Passed to the engine and the KV router.
    pub kv_cache_block_size: u32,

    /// How many times a request can be migrated to another worker if the HTTP server lost
    /// connection to the current worker.
    pub migration_limit: u32,

    /// User-defined metadata for custom worker behavior
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_data: Option<serde_json::Value>,
}
impl
ModelDeploymentCard
{
pub
fn
builder
()
->
ModelDeploymentCardBuilder
{
ModelDeploymentCardBuilder
::
default
()
}
/// Create a ModelDeploymentCard where only the name is filled in.
///
/// Single-process setups don't need an MDC to communicate model details, but it
/// simplifies the code to assume we always have one. This is how you get one in those
/// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
pub
fn
with_name_only
(
name
:
&
str
)
->
ModelDeploymentCard
{
ModelDeploymentCard
{
display_name
:
name
.to_string
(),
slug
:
Slug
::
from_string
(
name
),
..
Default
::
default
()
}
}
/// Interval at which to look for cards that expired because their workers are gone:
/// one third of `CARD_MAX_AGE`.
pub fn expiry_check_period() -> Duration {
    let Ok(max_age) = CARD_MAX_AGE.to_std() else {
        // Only happens if CARD_MAX_AGE is negative, which it isn't
        unreachable!("Cannot run card expiry watcher, invalid CARD_MAX_AGE");
    };
    max_age / 3
}
/// Read `file` into memory and deserialize it as a JSON model deployment card.
///
/// # Errors
/// Returns an `std::io::Error` if the file cannot be read or its content is not a valid
/// card (the JSON error is converted into an I/O error).
pub fn load_from_json_file<P: AsRef<Path>>(file: P) -> std::io::Result<Self> {
    let raw = std::fs::read_to_string(file)?;
    let card: Self = serde_json::from_str(&raw)?;
    Ok(card)
}
/// Load a model deployment card from a JSON string
pub
fn
load_from_json_str
(
json
:
&
str
)
->
Result
<
Self
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
from_str
(
json
)
?
)
}
//
// Methods
//
/// Save the model deployment card to a JSON file
pub
fn
save_to_json_file
(
&
self
,
file
:
&
str
)
->
Result
<
(),
anyhow
::
Error
>
{
std
::
fs
::
write
(
file
,
self
.to_json
()
?
)
?
;
Ok
(())
}
/// The cached slugified form of `display_name`.
pub fn slug(&self) -> &Slug {
    &self.slug
}
/// Serialize the model deployment card to a JSON string
pub
fn
to_json
(
&
self
)
->
Result
<
String
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
to_string
(
self
)
?
)
}
/// Checksum of this card: the BLAKE3 hash of its JSON form, rendered through the hash's
/// `Display` implementation.
///
/// `revision` is marked `skip_serializing` on the struct, so it does not contribute to
/// the checksum; every serialized field (including `last_published`) does.
///
/// # Panics
/// Panics if the card cannot be serialized to JSON.
pub fn mdcsum(&self) -> String {
    let json = self
        .to_json()
        .expect("ModelDeploymentCard must serialize to JSON to compute its checksum");
    blake3::hash(json.as_bytes()).to_string()
}
/// True when the card was last published more than `CARD_MAX_AGE` ago, which suggests the
/// worker is gone. A card that has never been published is not considered expired.
pub fn is_expired(&self) -> bool {
    match self.last_published {
        Some(published_at) => chrono::Utc::now() - published_at > CARD_MAX_AGE,
        None => false,
    }
}
/// Whether a tokenizer is configured on this card.
/// Placeholder cards (see `with_name_only`) do not have one.
pub fn has_tokenizer(&self) -> bool {
    self.tokenizer.is_some()
}
pub
fn
tokenizer_hf
(
&
self
)
->
anyhow
::
Result
<
HfTokenizer
>
{
match
&
self
.tokenizer
{
Some
(
TokenizerKind
::
HfTokenizerJson
(
file
))
=>
{
HfTokenizer
::
from_file
(
file
)
.map_err
(
anyhow
::
Error
::
msg
)
}
Some
(
TokenizerKind
::
GGUF
(
t
))
=>
Ok
(
*
t
.clone
()),
None
=>
{
anyhow
::
bail!
(
"Blank ModelDeploymentCard does not have a tokenizer"
);
}
}
}
pub
fn
is_gguf
(
&
self
)
->
bool
{
match
&
self
.model_info
{
Some
(
info
)
=>
info
.is_gguf
(),
None
=>
false
,
}
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URIs to point to NATS.
///
/// Only the file-backed HuggingFace variants are uploaded (`config.json`,
/// `tokenizer_config.json`, `chat_template.jinja`, `tokenizer.json`,
/// `generation_config.json`). A field holding any other variant (e.g. GGUF), or a value
/// that already is a NATS URL, is left untouched, and a field is only rewritten after its
/// upload succeeded.
///
/// # Errors
/// Returns an error if a target URL cannot be parsed or an upload fails. Fields uploaded
/// before the failure keep their new NATS URLs; the failing field keeps its old value.
pub async fn move_to_nats(&mut self, nats_client: nats::Client) -> Result<()> {
    let nats_addr = nats_client.addr();
    let bucket_name = self.slug().clone();
    tracing::debug!(
        nats_addr,
        %bucket_name,
        "Uploading model deployment card fields to NATS"
    );

    macro_rules! nats_upload {
        ($field:expr, $enum_variant:path, $filename:literal) => {
            // Borrow rather than `take()`: taking first would leave the field as `None`
            // whenever the variant does not match, the value already is a NATS URL, or
            // the upload returns early with an error.
            if let Some($enum_variant(src_file)) = $field.as_ref() {
                if !nats::is_nats_url(src_file) {
                    let target = format!("nats://{nats_addr}/{bucket_name}/{}", $filename);
                    nats_client
                        .object_store_upload(
                            &std::path::PathBuf::from(src_file),
                            url::Url::parse(&target)?,
                        )
                        .await?;
                    $field = Some($enum_variant(target));
                }
            }
        };
    }

    nats_upload!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
    nats_upload!(
        self.prompt_formatter,
        PromptFormatterArtifact::HfTokenizerConfigJson,
        "tokenizer_config.json"
    );
    nats_upload!(
        self.chat_template_file,
        PromptFormatterArtifact::HfChatTemplate,
        "chat_template.jinja"
    );
    nats_upload!(
        self.tokenizer,
        TokenizerKind::HfTokenizerJson,
        "tokenizer.json"
    );
    nats_upload!(
        self.gen_config,
        GenerationConfig::HfGenerationConfigJson,
        "generation_config.json"
    );

    Ok(())
}
/// Move the files this MDC uses from the NATS object store to local disk.
/// Updates the URIs to point to the created files.
///
/// Only fields that hold a file-backed HuggingFace variant whose value is a NATS URL are
/// downloaded. Any other field (a different variant such as GGUF, or a value that is not
/// a NATS URL) is left untouched, and a field is only rewritten after its download
/// succeeded.
///
/// The returned TempDir must be kept alive, it cleans up on drop.
///
/// # Errors
/// Returns an error if the temporary directory cannot be created, a source URL cannot be
/// parsed, or a download fails.
pub async fn move_from_nats(&mut self, nats_client: nats::Client) -> Result<tempfile::TempDir> {
    let nats_addr = nats_client.addr();
    let bucket_name = self.slug();
    let target_dir = tempfile::TempDir::with_prefix(bucket_name.to_string())?;
    tracing::debug!(
        nats_addr,
        %bucket_name,
        target_dir = %target_dir.path().display(),
        "Downloading model deployment card fields from NATS"
    );

    macro_rules! nats_download {
        ($field:expr, $enum_variant:path, $filename:literal) => {
            // Borrow rather than `take()`: taking first would leave the field as `None`
            // whenever the variant does not match, the value is not a NATS URL, or the
            // download returns early with an error.
            if let Some($enum_variant(src_url)) = $field.as_ref() {
                if nats::is_nats_url(src_url) {
                    let target = target_dir.path().join($filename);
                    nats_client
                        .object_store_download(Url::parse(src_url)?, &target)
                        .await?;
                    $field = Some($enum_variant(target.display().to_string()));
                }
            }
        };
    }

    nats_download!(self.model_info, ModelInfoType::HfConfigJson, "config.json");
    nats_download!(
        self.prompt_formatter,
        PromptFormatterArtifact::HfTokenizerConfigJson,
        "tokenizer_config.json"
    );
    nats_download!(
        self.chat_template_file,
        PromptFormatterArtifact::HfChatTemplate,
        "chat_template.jinja"
    );
    nats_download!(
        self.tokenizer,
        TokenizerKind::HfTokenizerJson,
        "tokenizer.json"
    );
    nats_download!(
        self.gen_config,
        GenerationConfig::HfGenerationConfigJson,
        "generation_config.json"
    );

    Ok(target_dir)
}
/// Delete the NATS object store bucket named after this card's slug.
///
/// NOTE(review): the previous doc comment also mentioned removing the card from the
/// key-value store; this method only issues the object store bucket deletion.
pub async fn delete_from_nats(&mut self, nats_client: nats::Client) -> Result<()> {
    let bucket_name = self.slug();
    let nats_addr = nats_client.addr();
    tracing::trace!(
        nats_addr,
        %bucket_name,
        "Delete model deployment card from NATS"
    );
    nats_client
        .object_store_delete_bucket(bucket_name.as_ref())
        .await
}
/// Override the name this model is registered under, updating both the display name and
/// the slug derived from it.
/// Corresponds to vllm's `--served-model-name`.
pub fn set_name(&mut self, name: &str) {
    self.slug = Slug::from_string(name);
    self.display_name = name.to_owned();
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and tokenizer_config.json
/// - a GGUF file
///
/// Anything that is not a directory is treated as a GGUF file.
pub async fn load(config_path: impl AsRef<Path>) -> anyhow::Result<ModelDeploymentCard> {
    let path = config_path.as_ref();
    if !path.is_dir() {
        return Self::from_gguf(path).await;
    }
    Self::from_local_path(path).await
}
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
async
fn
from_local_path
(
local_root_dir
:
impl
AsRef
<
Path
>
)
->
anyhow
::
Result
<
Self
>
{
let
local_root_dir
=
local_root_dir
.as_ref
();
check_valid_local_repo_path
(
local_root_dir
)
?
;
let
repo_id
=
local_root_dir
.canonicalize
()
?
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Path contains invalid Unicode"
))
?
.to_string
();
let
model_name
=
local_root_dir
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid model directory name"
))
?
;
Self
::
from_repo
(
&
repo_id
,
model_name
)
.await
}
async
fn
from_gguf
(
gguf_file
:
&
Path
)
->
anyhow
::
Result
<
Self
>
{
let
model_name
=
gguf_file
.iter
()
.next_back
()
.map
(|
n
|
n
.to_string_lossy
()
.to_string
());
let
Some
(
model_name
)
=
model_name
else
{
// I think this would only happy on an empty path
anyhow
::
bail!
(
"Could not extract model name from path '{}'"
,
gguf_file
.display
()
);
};
// TODO: we do this in HFConfig also, unify
let
content
=
load_gguf
(
gguf_file
)
?
;
let
context_length
=
content
.get_metadata
()[
&
format!
(
"{}.context_length"
,
content
.arch
())]
.to_u32
()
.unwrap_or
(
0
);
tracing
::
debug!
(
context_length
,
"Loaded context length from GGUF"
);
Ok
(
Self
{
display_name
:
model_name
.to_string
(),
slug
:
Slug
::
from_string
(
model_name
),
model_info
:
Some
(
ModelInfoType
::
GGUF
(
gguf_file
.to_path_buf
())),
tokenizer
:
Some
(
TokenizerKind
::
from_gguf
(
gguf_file
)
?
),
gen_config
:
None
,
// AFAICT there is no equivalent in a GGUF
prompt_formatter
:
Some
(
PromptFormatterArtifact
::
GGUF
(
gguf_file
.to_path_buf
())),
chat_template_file
:
None
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
revision
:
0
,
last_published
:
None
,
context_length
,
kv_cache_block_size
:
0
,
migration_limit
:
0
,
user_data
:
None
,
})
}
/// Placeholder for loading a card from an NGC repository; always fails.
// Not called anywhere yet, hence the lint exemption.
#[allow(dead_code)]
async fn from_ngc_repo(_: &str) -> anyhow::Result<Self> {
    anyhow::bail!("ModelDeploymentCard::from_ngc_repo is not implemented")
}
/// Assemble a card from the HuggingFace-style files found under `repo_id`, naming it
/// `model_name`.
///
/// `config.json` and `tokenizer.json` are required. `generation_config.json`,
/// `tokenizer_config.json` and `chat_template.jinja` are optional. The context length is
/// taken from `max_position_embeddings` in `config.json`, else from `model_max_length` in
/// `tokenizer_config.json`, else 0.
async fn from_repo(repo_id: &str, model_name: &str) -> anyhow::Result<Self> {
    let repo_dir = PathBuf::from(repo_id);

    // This is usually the right choice
    let context_length =
        crate::file_json_field(&repo_dir.join("config.json"), "max_position_embeddings")
            // But sometimes this is
            .or_else(|_| {
                crate::file_json_field(
                    &repo_dir.join("tokenizer_config.json"),
                    "model_max_length",
                )
            })
            // If neither of those are present let the engine default it
            .unwrap_or(0);

    // Probe the repo in the same order as the card's fields; the first required file
    // that is missing aborts the load.
    let model_info = ModelInfoType::from_repo(repo_id).await?;
    let tokenizer = TokenizerKind::from_repo(repo_id).await?;
    let gen_config = GenerationConfig::from_repo(repo_id).await.ok(); // optional
    let prompt_formatter = PromptFormatterArtifact::from_repo(repo_id).await?;
    let chat_template_file = PromptFormatterArtifact::chat_template_from_repo(repo_id).await?;

    Ok(Self {
        display_name: model_name.to_string(),
        slug: Slug::from_string(model_name),
        model_info: Some(model_info),
        tokenizer: Some(tokenizer),
        gen_config,
        prompt_formatter,
        chat_template_file,
        prompt_context: None, // TODO - auto-detect prompt context
        revision: 0,
        last_published: None,
        context_length,
        kv_cache_block_size: 0, // set later
        migration_limit: 0,
        user_data: None,
    })
}
}
impl Versioned for ModelDeploymentCard {
    /// The revision currently recorded on this card.
    fn revision(&self) -> u64 {
        self.revision
    }

    /// Record a new revision and stamp `last_published` with the current UTC time.
    fn set_revision(&mut self, revision: u64) {
        self.revision = revision;
        self.last_published = Some(chrono::Utc::now());
    }
}
impl fmt::Display for ModelDeploymentCard {
    /// A card displays as its slug.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let slug = self.slug();
        write!(f, "{slug}")
    }
}
/// Read-only view of the model properties this crate needs, independent of whether they
/// were loaded from a HuggingFace `config.json` or from GGUF metadata.
pub trait ModelInfo: Send + Sync {
    /// Model type
    fn model_type(&self) -> String;

    /// Token ID for the beginning of sequence
    fn bos_token_id(&self) -> TokenIdType;

    /// Token ID for the end of sequence
    /// More than one id may be returned.
    fn eos_token_ids(&self) -> Vec<TokenIdType>;

    /// Maximum position embeddings / max sequence length
    /// TODO: This is only used in a single test, no other code. Remove?
    fn max_position_embeddings(&self) -> Option<usize>;

    /// Vocabulary size
    /// TODO: This is only used in a single test, no other code. Remove?
    fn vocab_size(&self) -> Option<usize>;
}
impl ModelInfoType {
    /// Load the model properties from whichever source this value points at.
    pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
        match self {
            Self::GGUF(gguf_path) => HFConfig::from_gguf(gguf_path),
            Self::HfConfigJson(config_json) => HFConfig::from_json_file(config_json).await,
        }
    }

    /// Whether this is the GGUF variant.
    pub fn is_gguf(&self) -> bool {
        match self {
            Self::GGUF(_) => true,
            Self::HfConfigJson(_) => false,
        }
    }
}
/// The parts of a HuggingFace `config.json` this module reads. Also built from GGUF
/// metadata by `HFConfig::from_gguf`. Exposed to callers only through `ModelInfo`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfig {
    /// denotes the mixin to the flattened data model which can be present
    /// in the config.json file
    architectures: Vec<String>,

    /// general model type
    model_type: String,

    // When `config.json` has no `text_config` object, `from_json_file` fills this by
    // parsing the top level of the same file as an `HFTextConfig`.
    text_config: Option<HFTextConfig>,

    // Sometimes it's inside HFTextConfig, sometimes it's here
    eos_token_id: Option<serde_json::Value>,
}
/// Text-model settings, found either under `text_config` in `config.json` or at the top
/// level of that file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFTextConfig {
    // It can take multiple attempts to load this, so Option
    bos_token_id: Option<TokenIdType>,

    // We set this once bos_token_id is loaded so we don't have to deal with Option
    #[serde(default)]
    final_bos_token_id: TokenIdType,

    // A single number or an array of numbers; resolved into `final_eos_token_ids`.
    eos_token_id: Option<serde_json::Value>,

    // Set by the loaders once the end-of-sequence ids have been resolved.
    #[serde(default)]
    final_eos_token_ids: Vec<TokenIdType>,

    /// max sequence length
    max_position_embeddings: Option<usize>,

    /// number of layers in the model
    num_hidden_layers: usize,

    /// number of attention heads in the model
    num_attention_heads: Option<usize>,

    /// Vocabulary size
    vocab_size: Option<usize>,
}
impl
HFConfig
{
async
fn
from_json_file
(
file
:
&
str
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
let
file_pathbuf
=
PathBuf
::
from
(
file
);
let
contents
=
std
::
fs
::
read_to_string
(
file
)
?
;
let
mut
config
:
Self
=
serde_json
::
from_str
(
&
contents
)
?
;
if
config
.text_config
.is_none
()
{
let
text_config
:
HFTextConfig
=
serde_json
::
from_str
(
&
contents
)
?
;
config
.text_config
=
Some
(
text_config
);
}
// Sometimes bos_token_id is in generation_config.json not config.json
let
Some
(
text_config
)
=
config
.text_config
.as_mut
()
else
{
anyhow
::
bail!
(
"Missing text config fields (model_type, eos_token_ids, etc) in config.json"
);
};
if
text_config
.bos_token_id
.is_none
()
{
let
bos_token_id
=
crate
::
file_json_field
::
<
TokenIdType
>
(
&
Path
::
join
(
file_pathbuf
.parent
()
.unwrap_or
(
&
PathBuf
::
from
(
""
)),
"generation_config.json"
,
),
"bos_token_id"
,
)
.context
(
"missing bos_token_id in generation_config.json and config.json, cannot load"
,
)
?
;
text_config
.bos_token_id
=
Some
(
bos_token_id
);
}
// Now that we have it for sure, set it in the non-Option field
let
final_bos_token_id
=
text_config
.bos_token_id
.take
()
.unwrap
();
text_config
.final_bos_token_id
=
final_bos_token_id
;
// TODO: refactor this when we switch to per-architecture tokenization
let
final_eos_token_ids
:
Vec
<
TokenIdType
>
=
config
.eos_token_id
.as_ref
()
.or
(
text_config
.eos_token_id
.as_ref
())
.and_then
(|
v
|
{
if
v
.is_number
()
{
v
.as_number
()
.and_then
(|
n
|
n
.as_u64
())
.map
(|
n
|
vec!
[
n
as
TokenIdType
])
}
else
if
v
.is_array
()
{
let
arr
=
v
.as_array
()
.unwrap
();
// Safety: We just checked
Some
(
arr
.iter
()
.filter_map
(|
inner_v
|
{
inner_v
.as_number
()
.and_then
(|
n
|
n
.as_u64
())
.map
(|
n
|
n
as
TokenIdType
)
})
.collect
(),
)
}
else
{
tracing
::
error!
(
?
v
,
file
,
"eos_token_id is not a number or an array, cannot use"
);
None
}
})
.or_else
(||
{
// Maybe it's in generation_config.json
crate
::
file_json_field
(
&
Path
::
join
(
file_pathbuf
.parent
()
.unwrap_or
(
&
PathBuf
::
from
(
""
)),
"generation_config.json"
,
),
"eos_token_id"
,
)
.inspect_err
(
|
err
|
tracing
::
warn!
(
%
err
,
"Missing eos_token_id in generation_config.json"
),
)
.ok
()
})
.ok_or_else
(||
{
anyhow
::
anyhow!
(
"missing eos_token_id in config.json and generation_config.json, cannot load"
)
})
?
;
text_config
.final_eos_token_ids
=
final_eos_token_ids
;
Ok
(
Arc
::
new
(
config
))
}
/// Load model properties from the metadata of a GGUF file.
///
/// Reads `<arch>.block_count`, `tokenizer.ggml.bos_token_id`, `tokenizer.ggml.eos_token_id`
/// and `tokenizer.ggml.tokens` from the metadata. The single architecture name is
/// synthesized as `<Arch>ForCausalLM`.
///
/// # Errors
/// Returns an error if the file cannot be loaded, or one of the metadata entries above is
/// missing or has an unexpected type.
fn from_gguf(gguf_file: &Path) -> Result<Arc<dyn ModelInfo>> {
    let content = load_gguf(gguf_file)?;
    let model_config_metadata: ContentConfig = (&content).into();

    // Indexing the metadata map panics on a missing key. Look entries up with `get` so
    // that a file lacking one of them is reported through the returned `Result`.
    let metadata = content.get_metadata();
    let field = |key: &str| {
        metadata.get(key).with_context(|| {
            format!(
                "GGUF metadata key '{key}' is missing in {}",
                gguf_file.display()
            )
        })
    };

    let num_hidden_layers =
        field(&format!("{}.block_count", content.arch()))?.to_u32()? as usize;
    let bos_token_id = field("tokenizer.ggml.bos_token_id")?.to_u32()?;
    let eos_token_id = field("tokenizer.ggml.eos_token_id")?.to_u32()?;

    // to_vec returns a Vec that's already there, so it's cheap
    let vocab_size = field("tokenizer.ggml.tokens")?.to_vec()?.len();

    let arch = content.arch().to_string();
    Ok(Arc::new(HFConfig {
        architectures: vec![format!("{}ForCausalLM", capitalize(&arch))],
        // "general.architecture"
        model_type: arch,
        text_config: Some(HFTextConfig {
            bos_token_id: None,
            final_bos_token_id: bos_token_id,

            eos_token_id: None,
            final_eos_token_ids: vec![eos_token_id],

            // "llama.context_length"
            max_position_embeddings: Some(model_config_metadata.max_seq_len()),
            // "llama.block_count"
            num_hidden_layers,
            // "llama.attention.head_count"
            num_attention_heads: Some(model_config_metadata.num_attn_heads()),
            // "tokenizer.ggml.tokens".len()
            vocab_size: Some(vocab_size),
        }),
        eos_token_id: None,
    }))
}
}
// Every accessor below unwraps `text_config`: both constructors (`from_json_file` and
// `from_gguf`) only return an `HFConfig` whose `text_config` is `Some`.
impl ModelInfo for HFConfig {
    /// The general model type string.
    fn model_type(&self) -> String {
        self.model_type.clone()
    }

    /// The resolved beginning-of-sequence token id.
    fn bos_token_id(&self) -> TokenIdType {
        let text = self.text_config.as_ref().unwrap();
        text.final_bos_token_id
    }

    /// The resolved end-of-sequence token ids.
    fn eos_token_ids(&self) -> Vec<TokenIdType> {
        let text = self.text_config.as_ref().unwrap();
        text.final_eos_token_ids.clone()
    }

    /// Maximum sequence length, when known.
    fn max_position_embeddings(&self) -> Option<usize> {
        let text = self.text_config.as_ref().unwrap();
        text.max_position_embeddings
    }

    /// Vocabulary size, when known.
    fn vocab_size(&self) -> Option<usize> {
        let text = self.text_config.as_ref().unwrap();
        text.vocab_size
    }
}
impl TokenizerKind {
    /// Convert the tokenizer embedded in a GGUF file into an in-memory HuggingFace
    /// tokenizer.
    ///
    /// # Errors
    /// Returns an error, annotated with the file path, if the file cannot be loaded or the
    /// conversion fails.
    pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
        let content = load_gguf(gguf_file)?;
        let converted = crate::gguf::convert_gguf_to_hf_tokenizer(&content)
            .with_context(|| gguf_file.display().to_string())?;
        Ok(Self::GGUF(Box::new(converted.tokenizer)))
    }
}
pub
(
crate
)
fn
load_gguf
(
gguf_file
:
&
Path
)
->
anyhow
::
Result
<
Content
>
{
let
filename
=
gguf_file
.display
()
.to_string
();
let
mut
f
=
File
::
open
(
gguf_file
)
.with_context
(||
filename
.clone
())
?
;
// vec because GGUF can be split into multiple files (shards)
let
mut
readers
=
vec!
[
&
mut
f
];
crate
::
gguf
::
Content
::
from_readers
(
&
mut
readers
)
.with_context
(||
filename
.clone
())
}
/// Upper-case the first character of `s` and lower-case everything after it.
/// An empty input yields an empty string.
fn capitalize(s: &str) -> String {
    let mut rest = s.chars();
    let Some(head) = rest.next() else {
        return String::new();
    };
    let mut out: String = head.to_uppercase().collect();
    out.push_str(&rest.as_str().to_lowercase());
    out
}
impl ModelInfoType {
    /// Locate the model configuration inside `repo_id`.
    ///
    /// # Errors
    /// Returns an error naming the repo if `config.json` is not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let found = Self::try_is_hf_repo(repo_id).await;
        found.with_context(|| format!("unable to extract model info from repo {}", repo_id))
    }

    /// Treat `repo` as a HuggingFace-style directory and look for its `config.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        let config_json = check_for_file(repo, "config.json").await?;
        Ok(Self::HfConfigJson(config_json))
    }
}
impl PromptFormatterArtifact {
    /// Look for `tokenizer_config.json` inside `repo_id`.
    ///
    /// We should only error if we expect a prompt formatter and it's not found. Right now
    /// we don't know when to expect it, so a missing file yields `Ok(None)`.
    pub async fn from_repo(repo_id: &str) -> Result<Option<Self>> {
        let attempt = Self::try_is_hf_repo(repo_id)
            .await
            .with_context(|| format!("unable to extract prompt format from repo {}", repo_id));
        Ok(attempt.ok())
    }

    /// Look for a standalone `chat_template.jinja` inside `repo_id`.
    /// A missing file yields `Ok(None)`.
    pub async fn chat_template_from_repo(repo_id: &str) -> Result<Option<Self>> {
        let attempt = Self::chat_template_try_is_hf_repo(repo_id)
            .await
            .with_context(|| format!("unable to extract prompt format from repo {}", repo_id));
        Ok(attempt.ok())
    }

    /// Treat `repo` as a HuggingFace-style directory and look for `chat_template.jinja`.
    async fn chat_template_try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        let template = check_for_file(repo, "chat_template.jinja").await?;
        Ok(Self::HfChatTemplate(template))
    }

    /// Treat `repo` as a HuggingFace-style directory and look for `tokenizer_config.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        let tokenizer_config = check_for_file(repo, "tokenizer_config.json").await?;
        Ok(Self::HfTokenizerConfigJson(tokenizer_config))
    }
}
impl TokenizerKind {
    /// Locate the tokenizer definition inside `repo_id`.
    ///
    /// # Errors
    /// Returns an error naming the repo if `tokenizer.json` is not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let found = Self::try_is_hf_repo(repo_id).await;
        found.with_context(|| format!("unable to extract tokenizer kind from repo {}", repo_id))
    }

    /// Treat `repo` as a HuggingFace-style directory and look for its `tokenizer.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        let tokenizer_json = check_for_file(repo, "tokenizer.json").await?;
        Ok(Self::HfTokenizerJson(tokenizer_json))
    }
}
impl GenerationConfig {
    /// Locate the generation config inside `repo_id`.
    ///
    /// # Errors
    /// Returns an error naming the repo if `generation_config.json` is not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let found = Self::try_is_hf_repo(repo_id).await;
        found.with_context(|| format!("unable to extract generation config from repo {repo_id}"))
    }

    /// Treat `repo` as a HuggingFace-style directory and look for `generation_config.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        let generation_config = check_for_file(repo, "generation_config.json").await?;
        Ok(Self::HfGenerationConfigJson(generation_config))
    }
}
/// Checks if the provided path contains the expected file.
async
fn
check_for_file
(
repo_id
:
&
str
,
file
:
&
str
)
->
anyhow
::
Result
<
String
>
{
let
p
=
PathBuf
::
from
(
repo_id
)
.join
(
file
);
let
name
=
p
.display
()
.to_string
();
if
!
p
.exists
()
{
anyhow
::
bail!
(
"File not found: {name}"
)
}
Ok
(
name
)
}
/// Validate that `path` can serve as a local model repository.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist or isn't a directory
fn check_valid_local_repo_path(path: impl AsRef<Path>) -> Result<()> {
    let dir = path.as_ref();
    if !dir.exists() {
        anyhow::bail!("Model path does not exist: {}", dir.display());
    }
    if !dir.is_dir() {
        anyhow::bail!("Model path is not a directory: {}", dir.display());
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::HFConfig;
    use std::path::Path;

    /// Absolute path, as a string, of a file given relative to the crate's manifest directory.
    fn manifest_relative(relative: &str) -> String {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .join(relative)
            .display()
            .to_string()
    }

    /// The Llama 3 sample keeps its settings at the top level of config.json.
    #[tokio::test]
    pub async fn test_config_json_llama3() -> anyhow::Result<()> {
        let config_file = manifest_relative(
            "tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json",
        );
        let config = HFConfig::from_json_file(&config_file).await?;
        assert_eq!(config.bos_token_id(), 128000);
        Ok(())
    }

    /// Second sample model, with a different expected BOS token id.
    #[tokio::test]
    pub async fn test_config_json_llama4() -> anyhow::Result<()> {
        let config_file = manifest_relative(
            "tests/data/sample-models/Llama-4-Scout-17B-16E-Instruct/config.json",
        );
        let config = HFConfig::from_json_file(&config_file).await?;
        assert_eq!(config.bos_token_id(), 200000);
        Ok(())
    }
}
lib/llm/src/model_card/create.rs
deleted
100644 → 0
View file @
ccc8815b
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
crate
::
model_card
::
model
::
ModelDeploymentCard
;
use
anyhow
::{
Context
,
Result
};
use
std
::
path
::{
Path
,
PathBuf
};
use
crate
::
model_card
::
model
::{
ModelInfoType
,
PromptFormatterArtifact
,
TokenizerKind
};
use
super
::
model
::
GenerationConfig
;
impl
ModelDeploymentCard
{
/// Allow user to override the name we register this model under.
/// Corresponds to vllm's `--served-model-name`.
pub
fn
set_name
(
&
mut
self
,
name
:
&
str
)
{
self
.display_name
=
name
.to_string
();
self
.service_name
=
name
.to_string
();
}
/// Build an in-memory ModelDeploymentCard from either:
/// - a folder containing config.json, tokenizer.json and token_config.json
/// - a GGUF file
pub
async
fn
load
(
config_path
:
impl
AsRef
<
Path
>
)
->
anyhow
::
Result
<
ModelDeploymentCard
>
{
let
config_path
=
config_path
.as_ref
();
if
config_path
.is_dir
()
{
Self
::
from_local_path
(
config_path
)
.await
}
else
{
Self
::
from_gguf
(
config_path
)
.await
}
}
/// Creates a ModelDeploymentCard from a local directory path.
///
/// Currently HuggingFace format is supported and following files are expected:
/// - config.json: Model configuration in HuggingFace format
/// - tokenizer.json: Tokenizer configuration in HuggingFace format
/// - tokenizer_config.json: Optional prompt formatter configuration
///
/// # Arguments
/// * `local_root_dir` - Path to the local model directory
///
/// # Errors
/// Returns an error if:
/// - The path doesn't exist or isn't a directory
/// - The path contains invalid Unicode characters
/// - Required model files are missing or invalid
async
fn
from_local_path
(
local_root_dir
:
impl
AsRef
<
Path
>
)
->
anyhow
::
Result
<
Self
>
{
let
local_root_dir
=
local_root_dir
.as_ref
();
check_valid_local_repo_path
(
local_root_dir
)
?
;
let
repo_id
=
local_root_dir
.canonicalize
()
?
.to_str
()
.ok_or_else
(||
anyhow
::
anyhow!
(
"Path contains invalid Unicode"
))
?
.to_string
();
let
model_name
=
local_root_dir
.file_name
()
.and_then
(|
n
|
n
.to_str
())
.ok_or_else
(||
anyhow
::
anyhow!
(
"Invalid model directory name"
))
?
;
Self
::
from_repo
(
&
repo_id
,
model_name
)
.await
}
async
fn
from_gguf
(
gguf_file
:
&
Path
)
->
anyhow
::
Result
<
Self
>
{
let
model_name
=
gguf_file
.iter
()
.next_back
()
.map
(|
n
|
n
.to_string_lossy
()
.to_string
());
let
Some
(
model_name
)
=
model_name
else
{
// I think this would only happy on an empty path
anyhow
::
bail!
(
"Could not extract model name from path '{}'"
,
gguf_file
.display
()
);
};
// TODO: we do this in HFConfig also, unify
let
content
=
super
::
model
::
load_gguf
(
gguf_file
)
?
;
let
context_length
=
content
.get_metadata
()[
&
format!
(
"{}.context_length"
,
content
.arch
())]
.to_u32
()
.unwrap_or
(
0
);
tracing
::
debug!
(
context_length
,
"Loaded context length from GGUF"
);
Ok
(
Self
{
display_name
:
model_name
.to_string
(),
service_name
:
model_name
.to_string
(),
model_info
:
Some
(
ModelInfoType
::
GGUF
(
gguf_file
.to_path_buf
())),
tokenizer
:
Some
(
TokenizerKind
::
from_gguf
(
gguf_file
)
?
),
gen_config
:
None
,
// AFAICT there is no equivalent in a GGUF
prompt_formatter
:
Some
(
PromptFormatterArtifact
::
GGUF
(
gguf_file
.to_path_buf
())),
chat_template_file
:
None
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
revision
:
0
,
last_published
:
None
,
context_length
,
kv_cache_block_size
:
0
,
migration_limit
:
0
,
user_data
:
None
,
})
}
#[allow(dead_code)]
async
fn
from_ngc_repo
(
_
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
Err
(
anyhow
::
anyhow!
(
"ModelDeploymentCard::from_ngc_repo is not implemented"
))
}
async
fn
from_repo
(
repo_id
:
&
str
,
model_name
:
&
str
)
->
anyhow
::
Result
<
Self
>
{
// This is usually the right choice
let
context_length
=
crate
::
file_json_field
(
&
PathBuf
::
from
(
repo_id
)
.join
(
"config.json"
),
"max_position_embeddings"
,
)
// But sometimes this is
.or_else
(|
_
|
{
crate
::
file_json_field
(
&
PathBuf
::
from
(
repo_id
)
.join
(
"tokenizer_config.json"
),
"model_max_length"
,
)
})
// If neither of those are present let the engine default it
.unwrap_or
(
0
);
Ok
(
Self
{
display_name
:
model_name
.to_string
(),
service_name
:
model_name
.to_string
(),
model_info
:
Some
(
ModelInfoType
::
from_repo
(
repo_id
)
.await
?
),
tokenizer
:
Some
(
TokenizerKind
::
from_repo
(
repo_id
)
.await
?
),
gen_config
:
GenerationConfig
::
from_repo
(
repo_id
)
.await
.ok
(),
// optional
prompt_formatter
:
PromptFormatterArtifact
::
from_repo
(
repo_id
)
.await
?
,
chat_template_file
:
PromptFormatterArtifact
::
chat_template_from_repo
(
repo_id
)
.await
?
,
prompt_context
:
None
,
// TODO - auto-detect prompt context
revision
:
0
,
last_published
:
None
,
context_length
,
kv_cache_block_size
:
0
,
// set later
migration_limit
:
0
,
user_data
:
None
,
})
}
}
impl ModelInfoType {
    /// Locates the model configuration inside the directory `repo_id`.
    ///
    /// # Errors
    /// Returns an error, annotated with the repo name, when `config.json` is not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let detected = Self::try_is_hf_repo(repo_id).await;
        detected.with_context(|| format!("unable to extract model info from repo {}", repo_id))
    }

    /// Treats `repo` as a HuggingFace-style directory and points at its `config.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        check_for_file(repo, "config.json")
            .await
            .map(Self::HfConfigJson)
    }
}
impl PromptFormatterArtifact {
    /// Looks for `tokenizer_config.json` inside the directory `repo_id`.
    ///
    /// # Returns
    /// `Ok(Some(..))` when the file exists and `Ok(None)` when it does not. This function
    /// currently never returns `Err`; the `Result` is kept so a strict mode can be added
    /// without changing callers.
    pub async fn from_repo(repo_id: &str) -> Result<Option<Self>> {
        // we should only error if we expect a prompt formatter and it's not found
        // right now, we don't know when to expect it, so we just return Ok(Some/None)
        //
        // The lookup error is dropped on purpose, so no error context is built for it.
        Ok(Self::try_is_hf_repo(repo_id).await.ok())
    }

    /// Looks for a standalone `chat_template.jinja` inside the directory `repo_id`.
    ///
    /// # Returns
    /// `Ok(Some(..))` when the file exists and `Ok(None)` when it does not. This function
    /// currently never returns `Err`.
    pub async fn chat_template_from_repo(repo_id: &str) -> Result<Option<Self>> {
        // A missing chat template is not an error; the lookup failure is dropped.
        Ok(Self::chat_template_try_is_hf_repo(repo_id).await.ok())
    }

    /// Points at `chat_template.jinja` in `repo`, failing when the file is not found.
    async fn chat_template_try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        Ok(Self::HfChatTemplate(
            check_for_file(repo, "chat_template.jinja").await?,
        ))
    }

    /// Points at `tokenizer_config.json` in `repo`, failing when the file is not found.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        Ok(Self::HfTokenizerConfigJson(
            check_for_file(repo, "tokenizer_config.json").await?,
        ))
    }
}
impl TokenizerKind {
    /// Locates the tokenizer definition inside the directory `repo_id`.
    ///
    /// # Errors
    /// Returns an error, annotated with the repo name, when `tokenizer.json` is not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let detected = Self::try_is_hf_repo(repo_id).await;
        detected.with_context(|| format!("unable to extract tokenizer kind from repo {}", repo_id))
    }

    /// Treats `repo` as a HuggingFace-style directory and points at its `tokenizer.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        check_for_file(repo, "tokenizer.json")
            .await
            .map(Self::HfTokenizerJson)
    }
}
impl GenerationConfig {
    /// Locates the generation config inside the directory `repo_id`.
    ///
    /// # Errors
    /// Returns an error, annotated with the repo name, when `generation_config.json` is
    /// not found.
    pub async fn from_repo(repo_id: &str) -> Result<Self> {
        let detected = Self::try_is_hf_repo(repo_id).await;
        detected.with_context(|| format!("unable to extract generation config from repo {repo_id}"))
    }

    /// Treats `repo` as a HuggingFace-style directory and points at its
    /// `generation_config.json`.
    async fn try_is_hf_repo(repo: &str) -> anyhow::Result<Self> {
        check_for_file(repo, "generation_config.json")
            .await
            .map(Self::HfGenerationConfigJson)
    }
}
/// Checks that `file` exists inside the directory `repo_id`.
///
/// # Returns
/// The joined path rendered with `Path::display`.
///
/// # Errors
/// Returns `File not found: <path>` when the joined path does not exist.
async fn check_for_file(repo_id: &str, file: &str) -> anyhow::Result<String> {
    let candidate = PathBuf::from(repo_id).join(file);
    let name = candidate.display().to_string();
    if candidate.exists() {
        Ok(name)
    } else {
        anyhow::bail!("File not found: {name}")
    }
}
/// Validates that `path` can be used as a local model repository.
///
/// # Arguments
/// * `path` - Path to validate
///
/// # Errors
/// Returns an error if the path doesn't exist, or exists but isn't a directory.
fn check_valid_local_repo_path(path: impl AsRef<Path>) -> Result<()> {
    let dir = path.as_ref();
    anyhow::ensure!(
        dir.exists(),
        "Model path does not exist: {}",
        dir.display()
    );
    anyhow::ensure!(
        dir.is_dir(),
        "Model path is not a directory: {}",
        dir.display()
    );
    Ok(())
}
lib/llm/src/model_card/model.rs
deleted
100644 → 0
View file @
ccc8815b
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! # Model Deployment Card
//!
//! The ModelDeploymentCard (MDC) is the primary model configuration structure that will be available to any
//! component that needs to interact with the model or its dependent artifacts.
//!
//! The ModelDeploymentCard contains LLM model deployment configuration information:
//! - Display name and service name for the model
//! - Model information (ModelInfoType)
//! - Tokenizer configuration (TokenizerKind)
//! - Prompt formatter settings (PromptFormatterArtifact)
//! - Various metadata like revision, publish time, etc.
use
std
::
fmt
;
use
std
::
fs
::
File
;
use
std
::
path
::{
Path
,
PathBuf
};
use
std
::
sync
::
Arc
;
use
std
::
time
::
Duration
;
use
anyhow
::{
Context
,
Result
};
use
derive_builder
::
Builder
;
use
dynamo_runtime
::{
slug
::
Slug
,
storage
::
key_value_store
::
Versioned
,
transports
::
nats
};
use
serde
::{
Deserialize
,
Serialize
};
use
tokenizers
::
Tokenizer
as
HfTokenizer
;
use
url
::
Url
;
use
crate
::
gguf
::{
Content
,
ContentConfig
,
ModelConfigLike
};
use
crate
::
protocols
::
TokenIdType
;
/// If a model deployment card hasn't been refreshed in this much time the worker is likely gone.
///
/// `ModelDeploymentCard::is_expired` compares the age of `last_published` against this
/// value, and `ModelDeploymentCard::expiry_check_period` returns one third of it.
const CARD_MAX_AGE: chrono::TimeDelta = chrono::TimeDelta::minutes(5);
/// Where the model's configuration can be loaded from.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum ModelInfoType {
    /// Location of a HuggingFace `config.json`: a local file path, or a `nats://` URL after
    /// `ModelDeploymentCard::move_to_nats` has uploaded the file.
    HfConfigJson(String),
    /// Path to a GGUF file; `get_model_info` reads the configuration from its metadata.
    GGUF(PathBuf),
}
/// Where the model's tokenizer comes from.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum TokenizerKind {
    /// Location of a HuggingFace `tokenizer.json`: a local file path, or a `nats://` URL
    /// after `ModelDeploymentCard::move_to_nats` has uploaded the file.
    HfTokenizerJson(String),
    /// An in-memory HuggingFace tokenizer converted from a GGUF file
    /// (see `TokenizerKind::from_gguf`).
    GGUF(Box<HfTokenizer>),
}
/// Supported types of prompt formatters.
///
/// We need a way to associate the prompt formatter template definition with an associated
/// data model which is expected for rendering.
///
/// All current prompt formatters are Jinja2 templates which use the OpenAI ChatCompletionRequest
/// format. However, we currently do not have a discovery path to know if the model supports tool use
/// unless we inspect the template.
///
/// TODO(): Add an enum for the PromptFormatDataModel with at minimum arms for:
/// - OaiChat
/// - OaiChatToolUse
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum PromptFormatterArtifact {
    /// Location of a HuggingFace `tokenizer_config.json` (local path or `nats://` URL).
    HfTokenizerConfigJson(String),
    /// Location of a standalone `chat_template.jinja` (local path or `nats://` URL).
    HfChatTemplate(String),
    /// Path to a GGUF file that carries the prompt template.
    GGUF(PathBuf),
}
/// Optional additions to the prompt rendering context.
///
/// A list of these is stored in `ModelDeploymentCard::prompt_context`.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum PromptContextMixin {
    /// Support OAI Chat Messages and Tools
    OaiChat,

    /// Enables templates with `{{datetime}}` to be rendered with the current date and time.
    Llama3DateTime,
}
/// Where the model's default generation (sampling) parameters can be loaded from.
#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "snake_case")]
pub enum GenerationConfig {
    /// Location of a HuggingFace `generation_config.json` (local path or `nats://` URL).
    HfGenerationConfigJson(String),
    /// Path to a GGUF file.
    /// NOTE(review): nothing in this file constructs this variant; GGUF cards are built
    /// with `gen_config: None`. Confirm whether it is still needed.
    GGUF(PathBuf),
}
/// The primary model configuration structure: names, the locations of the model's
/// artifacts (config, tokenizer, prompt formatter, generation config) and publishing
/// metadata.
///
/// The `Builder` derive generates `ModelDeploymentCardBuilder`
/// (see `ModelDeploymentCard::builder`).
#[derive(Serialize, Deserialize, Clone, Debug, Builder, Default)]
pub struct ModelDeploymentCard {
    /// Human readable model name, e.g. "Meta Llama 3.1 8B Instruct"
    pub display_name: String,

    /// Identifier to expect in OpenAI compatible HTTP request, e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct"
    /// This will get slugified for use in NATS.
    pub service_name: String,

    /// Model information
    pub model_info: Option<ModelInfoType>,

    /// Tokenizer configuration
    pub tokenizer: Option<TokenizerKind>,

    /// Prompt Formatter configuration
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_formatter: Option<PromptFormatterArtifact>,

    /// chat template may be stored as a separate file instead of in `prompt_formatter`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub chat_template_file: Option<PromptFormatterArtifact>,

    /// Generation config - default sampling params
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub gen_config: Option<GenerationConfig>,

    /// Prompt Formatter Config
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt_context: Option<Vec<PromptContextMixin>>,

    /// When this card was last advertised by a worker. None if not yet published.
    pub last_published: Option<chrono::DateTime<chrono::Utc>>,

    /// Incrementing count of how many times we published this card.
    /// Never serialized; defaults to 0 when deserializing.
    #[serde(default, skip_serializing)]
    pub revision: u64,

    /// Max context (in number of tokens) this model can handle.
    /// The loaders in this crate leave this at 0 when they cannot determine it.
    pub context_length: u32,

    /// Size of a KV cache block - vllm only currently
    /// Passed to the engine and the KV router.
    pub kv_cache_block_size: u32,

    /// How many times a request can be migrated to another worker if the HTTP server lost
    /// connection to the current worker.
    pub migration_limit: u32,

    /// User-defined metadata for custom worker behavior
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_data: Option<serde_json::Value>,
}
impl
ModelDeploymentCard
{
pub
fn
builder
()
->
ModelDeploymentCardBuilder
{
ModelDeploymentCardBuilder
::
default
()
}
/// Create a ModelDeploymentCard where only the name is filled in.
///
/// Single-process setups don't need an MDC to communicate model details, but it
/// simplifies the code to assume we always have one. This is how you get one in those
/// cases. A quasi-null object: <https://en.wikipedia.org/wiki/Null_object_pattern>
pub
fn
with_name_only
(
name
:
&
str
)
->
ModelDeploymentCard
{
ModelDeploymentCard
{
display_name
:
name
.to_string
(),
service_name
:
Slug
::
slugify
(
name
)
.to_string
(),
..
Default
::
default
()
}
}
/// How often we should check if a model deployment card expired because it's workers are gone
pub
fn
expiry_check_period
()
->
Duration
{
match
CARD_MAX_AGE
.to_std
()
{
Ok
(
duration
)
=>
duration
/
3
,
Err
(
_
)
=>
{
// Only happens if CARD_MAX_AGE is negative, which it isn't
unreachable!
(
"Cannot run card expiry watcher, invalid CARD_MAX_AGE"
);
}
}
}
/// Load a model deployment card from a JSON file
pub
fn
load_from_json_file
<
P
:
AsRef
<
Path
>>
(
file
:
P
)
->
std
::
io
::
Result
<
Self
>
{
Ok
(
serde_json
::
from_str
(
&
std
::
fs
::
read_to_string
(
file
)
?
)
?
)
}
/// Load a model deployment card from a JSON string
pub
fn
load_from_json_str
(
json
:
&
str
)
->
Result
<
Self
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
from_str
(
json
)
?
)
}
//
// Methods
//
/// Save the model deployment card to a JSON file
pub
fn
save_to_json_file
(
&
self
,
file
:
&
str
)
->
Result
<
(),
anyhow
::
Error
>
{
std
::
fs
::
write
(
file
,
self
.to_json
()
?
)
?
;
Ok
(())
}
pub
fn
set_service_name
(
&
mut
self
,
service_name
:
&
str
)
{
self
.service_name
=
service_name
.to_string
();
}
pub
fn
slug
(
&
self
)
->
Slug
{
Slug
::
from_string
(
&
self
.display_name
)
}
/// Serialize the model deployment card to a JSON string
pub
fn
to_json
(
&
self
)
->
Result
<
String
,
anyhow
::
Error
>
{
Ok
(
serde_json
::
to_string
(
self
)
?
)
}
pub
fn
mdcsum
(
&
self
)
->
String
{
let
json
=
self
.to_json
()
.unwrap
();
format!
(
"{}"
,
blake3
::
hash
(
json
.as_bytes
()))
}
/// Was this card last published a long time ago, suggesting the worker is gone?
pub
fn
is_expired
(
&
self
)
->
bool
{
if
let
Some
(
last_published
)
=
self
.last_published
.as_ref
()
{
chrono
::
Utc
::
now
()
-
last_published
>
CARD_MAX_AGE
}
else
{
false
}
}
/// Is this a full model card with tokenizer?
/// There are cases where we have a placeholder card (see `with_name_only`).
pub
fn
has_tokenizer
(
&
self
)
->
bool
{
self
.tokenizer
.is_some
()
}
pub
fn
tokenizer_hf
(
&
self
)
->
anyhow
::
Result
<
HfTokenizer
>
{
match
&
self
.tokenizer
{
Some
(
TokenizerKind
::
HfTokenizerJson
(
file
))
=>
{
HfTokenizer
::
from_file
(
file
)
.map_err
(
anyhow
::
Error
::
msg
)
}
Some
(
TokenizerKind
::
GGUF
(
t
))
=>
Ok
(
*
t
.clone
()),
None
=>
{
anyhow
::
bail!
(
"Blank ModelDeploymentCard does not have a tokenizer"
);
}
}
}
pub
fn
is_gguf
(
&
self
)
->
bool
{
match
&
self
.model_info
{
Some
(
info
)
=>
info
.is_gguf
(),
None
=>
false
,
}
}
/// Move the files this MDC uses into the NATS object store.
/// Updates the URI's to point to NATS.
pub
async
fn
move_to_nats
(
&
mut
self
,
nats_client
:
nats
::
Client
)
->
Result
<
()
>
{
let
nats_addr
=
nats_client
.addr
();
let
bucket_name
=
self
.slug
();
tracing
::
debug!
(
nats_addr
,
%
bucket_name
,
"Uploading model deployment card fields to NATS"
);
macro_rules!
nats_upload
{
(
$field:expr
,
$enum_variant:path
,
$filename:literal
)
=>
{
if
let
Some
(
$enum_variant
(
src_file
))
=
$field
.take
()
{
if
!
nats
::
is_nats_url
(
&
src_file
)
{
let
target
=
format!
(
"nats://{nats_addr}/{bucket_name}/{}"
,
$filename
);
nats_client
.object_store_upload
(
&
std
::
path
::
PathBuf
::
from
(
&
src_file
),
url
::
Url
::
parse
(
&
target
)
?
,
)
.await
?
;
$field
=
Some
(
$enum_variant
(
target
));
}
}
};
}
nats_upload!
(
self
.model_info
,
ModelInfoType
::
HfConfigJson
,
"config.json"
);
nats_upload!
(
self
.prompt_formatter
,
PromptFormatterArtifact
::
HfTokenizerConfigJson
,
"tokenizer_config.json"
);
nats_upload!
(
self
.chat_template_file
,
PromptFormatterArtifact
::
HfChatTemplate
,
"chat_template.jinja"
);
nats_upload!
(
self
.tokenizer
,
TokenizerKind
::
HfTokenizerJson
,
"tokenizer.json"
);
nats_upload!
(
self
.gen_config
,
GenerationConfig
::
HfGenerationConfigJson
,
"generation_config.json"
);
Ok
(())
}
/// Move the files this MDC uses from the NATS object store to local disk.
/// Updates the URI's to point to the created files.
///
/// The returned TempDir must be kept alive, it cleans up on drop.
pub
async
fn
move_from_nats
(
&
mut
self
,
nats_client
:
nats
::
Client
)
->
Result
<
tempfile
::
TempDir
>
{
let
nats_addr
=
nats_client
.addr
();
let
bucket_name
=
self
.slug
();
let
target_dir
=
tempfile
::
TempDir
::
with_prefix
(
bucket_name
.to_string
())
?
;
tracing
::
debug!
(
nats_addr
,
%
bucket_name
,
target_dir
=
%
target_dir
.path
()
.display
(),
"Downloading model deployment card fields from NATS"
);
macro_rules!
nats_download
{
(
$field:expr
,
$enum_variant:path
,
$filename:literal
)
=>
{
if
let
Some
(
$enum_variant
(
src_url
))
=
$field
.take
()
{
if
nats
::
is_nats_url
(
&
src_url
)
{
let
target
=
target_dir
.path
()
.join
(
$filename
);
nats_client
.object_store_download
(
Url
::
parse
(
&
src_url
)
?
,
&
target
)
.await
?
;
$field
=
Some
(
$enum_variant
(
target
.display
()
.to_string
()));
}
}
};
}
nats_download!
(
self
.model_info
,
ModelInfoType
::
HfConfigJson
,
"config.json"
);
nats_download!
(
self
.prompt_formatter
,
PromptFormatterArtifact
::
HfTokenizerConfigJson
,
"tokenizer_config.json"
);
nats_download!
(
self
.chat_template_file
,
PromptFormatterArtifact
::
HfChatTemplate
,
"chat_template.jinja"
);
nats_download!
(
self
.tokenizer
,
TokenizerKind
::
HfTokenizerJson
,
"tokenizer.json"
);
nats_download!
(
self
.gen_config
,
GenerationConfig
::
HfGenerationConfigJson
,
"generation_config.json"
);
Ok
(
target_dir
)
}
/// Delete this card from the key-value store and it's URLs from the object store
pub
async
fn
delete_from_nats
(
&
mut
self
,
nats_client
:
nats
::
Client
)
->
Result
<
()
>
{
let
nats_addr
=
nats_client
.addr
();
let
bucket_name
=
self
.slug
();
tracing
::
trace!
(
nats_addr
,
%
bucket_name
,
"Delete model deployment card from NATS"
);
nats_client
.object_store_delete_bucket
(
bucket_name
.as_ref
())
.await
}
}
impl Versioned for ModelDeploymentCard {
    /// Current publish counter of this card.
    fn revision(&self) -> u64 {
        self.revision
    }

    /// Stores the new revision and stamps `last_published` with the current UTC time.
    fn set_revision(&mut self, revision: u64) {
        let published_at = chrono::Utc::now();
        self.revision = revision;
        self.last_published = Some(published_at);
    }
}
impl fmt::Display for ModelDeploymentCard {
    /// Renders the card as its slug (see `ModelDeploymentCard::slug`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let slug = self.slug();
        f.write_fmt(format_args!("{}", slug))
    }
}
/// Read-only model facts needed by the rest of the crate, independent of the file format
/// the model was loaded from. Implemented in this file by `HFConfig`.
pub trait ModelInfo: Send + Sync {
    /// Model type
    fn model_type(&self) -> String;

    /// Token ID for the beginning of sequence
    fn bos_token_id(&self) -> TokenIdType;

    /// Token IDs for the end of sequence; a model may declare more than one.
    fn eos_token_ids(&self) -> Vec<TokenIdType>;

    /// Maximum position embeddings / max sequence length
    /// TODO: This is only used in a single test, no other code. Remove?
    fn max_position_embeddings(&self) -> Option<usize>;

    /// Vocabulary size
    /// TODO: This is only used in a single test, no other code. Remove?
    fn vocab_size(&self) -> Option<usize>;
}
impl ModelInfoType {
    /// Loads the model information this entry points at.
    ///
    /// # Errors
    /// Propagates the loader's error (unreadable or invalid `config.json` / GGUF file).
    pub async fn get_model_info(&self) -> Result<Arc<dyn ModelInfo>> {
        let loaded = match self {
            Self::GGUF(gguf_path) => HFConfig::from_gguf(gguf_path)?,
            Self::HfConfigJson(config_json) => HFConfig::from_json_file(config_json).await?,
        };
        Ok(loaded)
    }

    /// True for the `GGUF` variant.
    pub fn is_gguf(&self) -> bool {
        match self {
            Self::GGUF(_) => true,
            Self::HfConfigJson(_) => false,
        }
    }
}
/// Model configuration in the shape of a HuggingFace `config.json`.
///
/// Built either by deserializing a `config.json` (`from_json_file`) or by synthesizing
/// the fields from GGUF metadata (`from_gguf`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFConfig {
    /// denotes the mixin to the flattened data model which can be present
    /// in the config.json file
    architectures: Vec<String>,

    /// general model type
    model_type: String,

    /// Text-model settings. `from_json_file` fills this from the top level of the file
    /// when the file has no nested `text_config` object.
    text_config: Option<HFTextConfig>,

    // Sometimes it's inside HFTextConfig, sometimes it's here.
    // Kept as raw JSON because `from_json_file` accepts either a number or an array.
    eos_token_id: Option<serde_json::Value>,
}
/// Text-model portion of a HuggingFace config.
///
/// The `final_*` fields are not expected in the input (they default when absent); the
/// loaders in `HFConfig` fill them once the token IDs have been resolved.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HFTextConfig {
    // It can take multiple attempts to load this, so Option
    bos_token_id: Option<TokenIdType>,

    // We set this once bos_token_id is loaded so we don't have to deal with Option
    #[serde(default)]
    final_bos_token_id: TokenIdType,

    // Raw JSON because the loader accepts either a single number or an array of numbers
    eos_token_id: Option<serde_json::Value>,

    // Resolved end-of-sequence token IDs, set by the loader
    #[serde(default)]
    final_eos_token_ids: Vec<TokenIdType>,

    /// max sequence length
    max_position_embeddings: Option<usize>,

    /// number of layers in the model
    num_hidden_layers: usize,

    /// number of attention heads in the model
    num_attention_heads: Option<usize>,

    /// Vocabulary size
    vocab_size: Option<usize>,
}
impl
HFConfig
{
async
fn
from_json_file
(
file
:
&
str
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
let
file_pathbuf
=
PathBuf
::
from
(
file
);
let
contents
=
std
::
fs
::
read_to_string
(
file
)
?
;
let
mut
config
:
Self
=
serde_json
::
from_str
(
&
contents
)
?
;
if
config
.text_config
.is_none
()
{
let
text_config
:
HFTextConfig
=
serde_json
::
from_str
(
&
contents
)
?
;
config
.text_config
=
Some
(
text_config
);
}
// Sometimes bos_token_id is in generation_config.json not config.json
let
Some
(
text_config
)
=
config
.text_config
.as_mut
()
else
{
anyhow
::
bail!
(
"Missing text config fields (model_type, eos_token_ids, etc) in config.json"
);
};
if
text_config
.bos_token_id
.is_none
()
{
let
bos_token_id
=
crate
::
file_json_field
::
<
TokenIdType
>
(
&
Path
::
join
(
file_pathbuf
.parent
()
.unwrap_or
(
&
PathBuf
::
from
(
""
)),
"generation_config.json"
,
),
"bos_token_id"
,
)
.context
(
"missing bos_token_id in generation_config.json and config.json, cannot load"
,
)
?
;
text_config
.bos_token_id
=
Some
(
bos_token_id
);
}
// Now that we have it for sure, set it in the non-Option field
let
final_bos_token_id
=
text_config
.bos_token_id
.take
()
.unwrap
();
text_config
.final_bos_token_id
=
final_bos_token_id
;
// TODO: refactor this when we switch to per-architecture tokenization
let
final_eos_token_ids
:
Vec
<
TokenIdType
>
=
config
.eos_token_id
.as_ref
()
.or
(
text_config
.eos_token_id
.as_ref
())
.and_then
(|
v
|
{
if
v
.is_number
()
{
v
.as_number
()
.and_then
(|
n
|
n
.as_u64
())
.map
(|
n
|
vec!
[
n
as
TokenIdType
])
}
else
if
v
.is_array
()
{
let
arr
=
v
.as_array
()
.unwrap
();
// Safety: We just checked
Some
(
arr
.iter
()
.filter_map
(|
inner_v
|
{
inner_v
.as_number
()
.and_then
(|
n
|
n
.as_u64
())
.map
(|
n
|
n
as
TokenIdType
)
})
.collect
(),
)
}
else
{
tracing
::
error!
(
?
v
,
file
,
"eos_token_id is not a number or an array, cannot use"
);
None
}
})
.or_else
(||
{
// Maybe it's in generation_config.json
crate
::
file_json_field
(
&
Path
::
join
(
file_pathbuf
.parent
()
.unwrap_or
(
&
PathBuf
::
from
(
""
)),
"generation_config.json"
,
),
"eos_token_id"
,
)
.inspect_err
(
|
err
|
tracing
::
warn!
(
%
err
,
"Missing eos_token_id in generation_config.json"
),
)
.ok
()
})
.ok_or_else
(||
{
anyhow
::
anyhow!
(
"missing eos_token_id in config.json and generation_config.json, cannot load"
)
})
?
;
text_config
.final_eos_token_ids
=
final_eos_token_ids
;
Ok
(
Arc
::
new
(
config
))
}
fn
from_gguf
(
gguf_file
:
&
Path
)
->
Result
<
Arc
<
dyn
ModelInfo
>>
{
let
content
=
load_gguf
(
gguf_file
)
?
;
let
model_config_metadata
:
ContentConfig
=
(
&
content
)
.into
();
let
num_hidden_layers
=
content
.get_metadata
()[
&
format!
(
"{}.block_count"
,
content
.arch
())]
.to_u32
()
?
as
usize
;
let
bos_token_id
=
content
.get_metadata
()[
"tokenizer.ggml.bos_token_id"
]
.to_u32
()
?
;
let
eos_token_id
=
content
.get_metadata
()[
"tokenizer.ggml.eos_token_id"
]
.to_u32
()
?
;
// to_vec returns a Vec that's already there, so it's cheap
let
vocab_size
=
content
.get_metadata
()[
"tokenizer.ggml.tokens"
]
.to_vec
()
?
.len
();
let
arch
=
content
.arch
()
.to_string
();
Ok
(
Arc
::
new
(
HFConfig
{
architectures
:
vec!
[
format!
(
"{}ForCausalLM"
,
capitalize
(
&
arch
))],
// "general.architecture"
model_type
:
arch
,
text_config
:
Some
(
HFTextConfig
{
bos_token_id
:
None
,
final_bos_token_id
:
bos_token_id
,
eos_token_id
:
None
,
final_eos_token_ids
:
vec!
[
eos_token_id
],
// "llama.context_length"
max_position_embeddings
:
Some
(
model_config_metadata
.max_seq_len
()),
// "llama.block_count"
num_hidden_layers
,
// "llama.attention.head_count"
num_attention_heads
:
Some
(
model_config_metadata
.num_attn_heads
()),
// "tokenizer.ggml.tokens".len()
vocab_size
:
Some
(
vocab_size
),
}),
eos_token_id
:
None
,
}))
}
}
impl ModelInfo for HFConfig {
    /// General model type, copied from the config.
    fn model_type(&self) -> String {
        self.model_type.to_owned()
    }

    /// Resolved BOS token ID. Panics if `text_config` is `None`.
    fn bos_token_id(&self) -> TokenIdType {
        let text = self.text_config.as_ref().unwrap();
        text.final_bos_token_id
    }

    /// Resolved EOS token IDs. Panics if `text_config` is `None`.
    fn eos_token_ids(&self) -> Vec<TokenIdType> {
        let text = self.text_config.as_ref().unwrap();
        text.final_eos_token_ids.to_vec()
    }

    /// Max sequence length, if the config declares one. Panics if `text_config` is `None`.
    fn max_position_embeddings(&self) -> Option<usize> {
        let text = self.text_config.as_ref().unwrap();
        text.max_position_embeddings
    }

    /// Vocabulary size, if the config declares one. Panics if `text_config` is `None`.
    fn vocab_size(&self) -> Option<usize> {
        let text = self.text_config.as_ref().unwrap();
        text.vocab_size
    }
}
impl TokenizerKind {
    /// Builds an in-memory HuggingFace tokenizer from the GGUF file at `gguf_file`.
    ///
    /// # Errors
    /// Returns an error if the file cannot be loaded or the conversion fails; a
    /// conversion failure is annotated with the file path.
    pub fn from_gguf(gguf_file: &Path) -> anyhow::Result<Self> {
        let content = load_gguf(gguf_file)?;
        let converted = crate::gguf::convert_gguf_to_hf_tokenizer(&content)
            .with_context(|| gguf_file.display().to_string())?;
        let tokenizer = Box::new(converted.tokenizer);
        Ok(Self::GGUF(tokenizer))
    }
}
pub
(
crate
)
fn
load_gguf
(
gguf_file
:
&
Path
)
->
anyhow
::
Result
<
Content
>
{
let
filename
=
gguf_file
.display
()
.to_string
();
let
mut
f
=
File
::
open
(
gguf_file
)
.with_context
(||
filename
.clone
())
?
;
// vec because GGUF can be split into multiple files (shards)
let
mut
readers
=
vec!
[
&
mut
f
];
crate
::
gguf
::
Content
::
from_readers
(
&
mut
readers
)
.with_context
(||
filename
.clone
())
}
/// Upper-cases the first character of `s` and lower-cases every other character.
///
/// Case mapping is applied per character, so one character may expand to several.
fn capitalize(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        None => String::new(),
        Some(first) => first
            .to_uppercase()
            .chain(rest.flat_map(char::to_lowercase))
            .collect(),
    }
}
#[cfg(test)]
mod tests {
    use super::HFConfig;
    use std::path::Path;

    /// Resolves `relative` against the crate root and renders it as a string.
    fn crate_file(relative: &str) -> String {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .join(relative)
            .display()
            .to_string()
    }

    /// The mock Llama 3.1 config must yield the expected BOS token id.
    #[tokio::test]
    pub async fn test_config_json_llama3() -> anyhow::Result<()> {
        let path = crate_file("tests/data/sample-models/mock-llama-3.1-8b-instruct/config.json");
        let config = HFConfig::from_json_file(&path).await?;
        assert_eq!(config.bos_token_id(), 128000);
        Ok(())
    }

    /// The Llama 4 Scout sample config must yield the expected BOS token id.
    #[tokio::test]
    pub async fn test_config_json_llama4() -> anyhow::Result<()> {
        let path = crate_file("tests/data/sample-models/Llama-4-Scout-17B-16E-Instruct/config.json");
        let config = HFConfig::from_json_file(&path).await?;
        assert_eq!(config.bos_token_id(), 200000);
        Ok(())
    }
}
lib/llm/src/preprocessor.rs
View file @
1954fcfa
...
...
@@ -22,7 +22,7 @@ use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use
std
::{
collections
::
HashMap
,
sync
::
Arc
};
use
tracing
;
use
crate
::
model_card
::
model
::
{
ModelDeploymentCard
,
ModelInfo
,
TokenizerKind
};
use
crate
::
model_card
::{
ModelDeploymentCard
,
ModelInfo
,
TokenizerKind
};
use
crate
::
preprocessor
::
prompt
::
OAIChatLikeRequest
;
use
crate
::
tokenizers
::
Encoding
;
...
...
lib/llm/src/preprocessor/prompt/template.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
std
::{
collections
::
HashSet
,
sync
::
Arc
};
use
anyhow
::{
Context
,
Ok
,
Result
};
use
minijinja
::
Environment
;
use
crate
::
model_card
::
model
::
{
ModelDeploymentCard
,
PromptContextMixin
,
PromptFormatterArtifact
};
use
crate
::
model_card
::{
ModelDeploymentCard
,
PromptContextMixin
,
PromptFormatterArtifact
};
mod
context
;
mod
formatters
;
...
...
lib/llm/tests/backend.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
dynamo_llm
::
backend
::
Backend
;
use
dynamo_llm
::
model_card
::
model
::
ModelDeploymentCard
;
use
dynamo_llm
::
model_card
::
ModelDeploymentCard
;
#[tokio::test]
async
fn
test_sequence_factory
()
{
...
...
lib/llm/tests/model_card.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
dynamo_llm
::
model_card
::
model
::
{
ModelDeploymentCard
,
PromptFormatterArtifact
,
TokenizerKind
};
use
dynamo_llm
::
model_card
::{
ModelDeploymentCard
,
PromptFormatterArtifact
,
TokenizerKind
};
use
tempfile
::
tempdir
;
const
HF_PATH
:
&
str
=
"tests/data/sample-models/TinyLlama_v1.1"
;
...
...
lib/llm/tests/preprocessor.rs
View file @
1954fcfa
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
anyhow
::{
Ok
,
Result
};
use
dynamo_llm
::
model_card
::
model
::
{
ModelDeploymentCard
,
PromptContextMixin
};
use
dynamo_llm
::
model_card
::{
ModelDeploymentCard
,
PromptContextMixin
};
use
dynamo_llm
::
preprocessor
::
prompt
::
PromptFormatter
;
use
dynamo_llm
::
protocols
::
openai
::
chat_completions
::
NvCreateChatCompletionRequest
;
use
serde
::{
Deserialize
,
Serialize
};
...
...
lib/runtime/src/slug.rs
View file @
1954fcfa
...
...
@@ -21,7 +21,7 @@ const REPLACEMENT_CHAR: char = '_';
/// URL and NATS friendly string.
/// Only a-z, 0-9, - and _.
#[derive(Serialize,
Clone,
Debug,
Eq,
PartialEq)]
#[derive(Serialize,
Clone,
Debug,
Eq,
PartialEq
,
Default
)]
pub
struct
Slug
(
String
);
impl
Slug
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment