Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d68030f
Unverified
Commit
6d68030f
authored
May 26, 2025
by
Naveassaf
Committed by
GitHub
May 26, 2025
Browse files
[Model] Add support for YARN in NemotronNAS models (#18427)
Signed-off-by:
Nave Assaf
<
nassaf@nvidia.com
>
parent
5a2c76cb
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
67 additions
and
16 deletions
+67
-16
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+21
-14
vllm/model_executor/models/nemotron_nas.py
vllm/model_executor/models/nemotron_nas.py
+46
-2
No files found.
vllm/model_executor/models/llama.py
View file @
6d68030f
...
...
@@ -162,20 +162,9 @@ class LlamaAttention(nn.Module):
prefix
=
f
"
{
prefix
}
.o_proj"
,
)
is_neox_style
=
True
is_gguf
=
quant_config
and
quant_config
.
get_name
()
==
"gguf"
if
is_gguf
and
config
.
model_type
==
"llama"
:
is_neox_style
=
False
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
max_position_embeddings
,
base
=
rope_theta
,
self
.
_init_rotary_emb
(
config
,
rope_scaling
=
rope_scaling
,
is_neox_style
=
is_neox_style
,
partial_rotary_factor
=
self
.
partial_rotary_factor
,
)
quant_config
=
quant_config
)
if
hasattr
(
config
,
"interleaved_sliding_window"
):
interleaved_sliding_window
=
config
.
interleaved_sliding_window
...
...
@@ -214,6 +203,24 @@ class LlamaAttention(nn.Module):
output
,
_
=
self
.
o_proj
(
attn_output
)
return
output
def
_init_rotary_emb
(
self
,
config
:
LlamaConfig
,
rope_scaling
:
Optional
[
dict
[
str
,
Any
]],
quant_config
:
Optional
[
QuantizationConfig
])
->
None
:
is_neox_style
=
True
is_gguf
=
quant_config
and
quant_config
.
get_name
()
==
"gguf"
if
is_gguf
and
self
.
config
.
model_type
==
"llama"
:
is_neox_style
=
False
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
self
.
max_position_embeddings
,
base
=
self
.
rope_theta
,
rope_scaling
=
rope_scaling
,
is_neox_style
=
is_neox_style
,
partial_rotary_factor
=
self
.
partial_rotary_factor
,
)
class
LlamaDecoderLayer
(
nn
.
Module
):
...
...
vllm/model_executor/models/nemotron_nas.py
View file @
6d68030f
...
...
@@ -23,18 +23,20 @@
# limitations under the License.
"""Inference-only deci model compatible with HuggingFace weights."""
from
collections.abc
import
Iterable
from
typing
import
Optional
,
Union
from
typing
import
Any
,
Optional
,
Union
import
torch
from
torch
import
nn
from
transformers
import
LlamaConfig
from
vllm.attention
import
AttentionType
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int:
return
n
+
k
-
(
n
%
k
)
class
DeciLMAttention
(
LlamaAttention
):
def
__init__
(
self
,
config
:
LlamaConfig
,
hidden_size
:
int
,
num_heads
:
int
,
num_kv_heads
:
int
,
rope_theta
:
float
=
10000
,
rope_scaling
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
max_position_embeddings
:
int
=
8192
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
bias
:
bool
=
False
,
bias_o_proj
:
bool
=
False
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
prefix
:
str
=
""
,
attn_type
:
str
=
AttentionType
.
DECODER
,
)
->
None
:
super
().
__init__
(
config
,
hidden_size
,
num_heads
,
num_kv_heads
,
rope_theta
,
rope_scaling
,
max_position_embeddings
,
quant_config
,
bias
,
bias_o_proj
,
cache_config
,
prefix
,
attn_type
)
def
_init_rotary_emb
(
self
,
config
,
rope_scaling
:
Optional
[
dict
[
str
,
Any
]],
quant_config
:
Optional
[
QuantizationConfig
])
->
None
:
# Enables YARN for Mistral and LLaMA4 derivatives.
is_neox_style
=
True
if
hasattr
(
config
,
"position_embedding_type"
):
is_neox_style
=
config
.
position_embedding_type
not
in
[
"mistral_yarn"
,
"rope_llama4"
]
self
.
rotary_emb
=
get_rope
(
self
.
head_dim
,
rotary_dim
=
self
.
head_dim
,
max_position
=
self
.
max_position_embeddings
,
base
=
self
.
rope_theta
,
rope_scaling
=
rope_scaling
,
is_neox_style
=
is_neox_style
,
partial_rotary_factor
=
self
.
partial_rotary_factor
)
class
DeciLMDecoderLayer
(
nn
.
Module
):
def
__init__
(
...
...
@@ -98,7 +142,7 @@ class DeciLMDecoderLayer(nn.Module):
if
not
self
.
_is_no_op_attention
:
num_kv_heads
=
(
config
.
num_attention_heads
//
block_config
.
attention
.
n_heads_in_group
)
self
.
self_attn
=
Llama
Attention
(
self
.
self_attn
=
DeciLM
Attention
(
config
=
config
,
hidden_size
=
self
.
hidden_size
,
num_heads
=
config
.
num_attention_heads
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment