Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1c3ffdbe
Unverified
Commit
1c3ffdbe
authored
Sep 21, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 21, 2025
Browse files
[V0 Deprecation] Remove V0 sampling metadata (#25345)
Signed-off-by:
Woosuk Kwon
<
woosuk@thinkingmachines.ai
>
parent
c438b295
Changes
141
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
26 additions
and
79 deletions
+26
-79
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+3
-5
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
...ugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+3
-5
vllm/model_executor/__init__.py
vllm/model_executor/__init__.py
+0
-2
vllm/model_executor/layers/logits_processor.py
vllm/model_executor/layers/logits_processor.py
+0
-2
vllm/model_executor/models/apertus.py
vllm/model_executor/models/apertus.py
+1
-4
vllm/model_executor/models/arcee.py
vllm/model_executor/models/arcee.py
+3
-4
vllm/model_executor/models/arctic.py
vllm/model_executor/models/arctic.py
+1
-4
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-5
vllm/model_executor/models/aya_vision.py
vllm/model_executor/models/aya_vision.py
+1
-4
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+1
-4
vllm/model_executor/models/bailing_moe.py
vllm/model_executor/models/bailing_moe.py
+1
-4
vllm/model_executor/models/bamba.py
vllm/model_executor/models/bamba.py
+1
-4
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+1
-4
vllm/model_executor/models/bloom.py
vllm/model_executor/models/bloom.py
+1
-4
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+1
-4
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+1
-4
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere2_vision.py
+1
-4
vllm/model_executor/models/commandr.py
vllm/model_executor/models/commandr.py
+2
-4
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+1
-4
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+1
-4
No files found.
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
View file @
1c3ffdbe
...
...
@@ -9,7 +9,6 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
LlavaForConditionalGeneration
,
LlavaMultiModalProcessor
,
LlavaProcessingInfo
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
...
...
@@ -18,11 +17,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
dummy_inputs
=
LlavaDummyInputsBuilder
)
class
MyLlava
(
LlavaForConditionalGeneration
):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
)
->
Optional
[
torch
.
Tensor
]:
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
Optional
[
torch
.
Tensor
]:
# this dummy model always predicts the first token
logits
=
super
().
compute_logits
(
hidden_states
,
sampling_metadata
)
logits
=
super
().
compute_logits
(
hidden_states
)
if
logits
is
not
None
:
logits
.
zero_
()
logits
[:,
0
]
+=
1.0
...
...
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
View file @
1c3ffdbe
...
...
@@ -6,16 +6,14 @@ from typing import Optional
import
torch
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class
MyOPTForCausalLM
(
OPTForCausalLM
):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
)
->
Optional
[
torch
.
Tensor
]:
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
Optional
[
torch
.
Tensor
]:
# this dummy model always predicts the first token
logits
=
super
().
compute_logits
(
hidden_states
,
sampling_metadata
)
logits
=
super
().
compute_logits
(
hidden_states
)
if
logits
is
not
None
:
logits
.
zero_
()
logits
[:,
0
]
+=
1.0
...
...
vllm/model_executor/__init__.py
View file @
1c3ffdbe
...
...
@@ -3,11 +3,9 @@
from
vllm.model_executor.parameter
import
(
BasevLLMParameter
,
PackedvLLMParameter
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_random_seed
__all__
=
[
"SamplingMetadata"
,
"set_random_seed"
,
"BasevLLMParameter"
,
"PackedvLLMParameter"
,
...
...
vllm/model_executor/layers/logits_processor.py
View file @
1c3ffdbe
...
...
@@ -10,7 +10,6 @@ from vllm.distributed import (tensor_model_parallel_all_gather,
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.platforms
import
current_platform
...
...
@@ -50,7 +49,6 @@ class LogitsProcessor(CustomOp):
self
,
lm_head
:
VocabParallelEmbedding
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
Optional
[
SamplingMetadata
]
=
None
,
embedding_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Optional
[
torch
.
Tensor
]:
if
self
.
logits_as_input
:
...
...
vllm/model_executor/models/apertus.py
View file @
1c3ffdbe
...
...
@@ -48,7 +48,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
...
@@ -566,10 +565,8 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/arcee.py
View file @
1c3ffdbe
...
...
@@ -399,11 +399,10 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
inputs_embeds
=
inputs_embeds
)
return
model_output
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
)
->
Optional
[
torch
.
Tensor
]:
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
Optional
[
torch
.
Tensor
]:
# Compute final logits from hidden states (last pipeline rank only)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
...
...
vllm/model_executor/models/arctic.py
View file @
1c3ffdbe
...
...
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -456,10 +455,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/aria.py
View file @
1c3ffdbe
...
...
@@ -19,7 +19,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
...
...
@@ -644,10 +643,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/aya_vision.py
View file @
1c3ffdbe
...
...
@@ -16,7 +16,6 @@ from transformers.models.got_ocr2.image_processing_got_ocr2 import (
get_optimal_tiled_canvas
)
from
vllm.config
import
VllmConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalKwargsItems
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
...
...
@@ -464,7 +463,5 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
return
self
.
language_model
.
compute_logits
(
hidden_states
)
vllm/model_executor/models/baichuan.py
View file @
1c3ffdbe
...
...
@@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
row_parallel_weight_loader
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
,
SupportsQuant
...
...
@@ -421,10 +420,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/bailing_moe.py
View file @
1c3ffdbe
...
...
@@ -51,7 +51,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
...
@@ -623,10 +622,8 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/bamba.py
View file @
1c3ffdbe
...
...
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
LayerBlockType
...
...
@@ -571,10 +570,8 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/blip2.py
View file @
1c3ffdbe
...
...
@@ -12,7 +12,6 @@ from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig,
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargsItems
)
...
...
@@ -704,10 +703,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
return
self
.
language_model
.
compute_logits
(
hidden_states
)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
...
...
vllm/model_executor/models/bloom.py
View file @
1c3ffdbe
...
...
@@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
,
SupportsQuant
...
...
@@ -355,10 +354,8 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/chameleon.py
View file @
1c3ffdbe
...
...
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
row_parallel_weight_loader
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
...
...
@@ -1046,10 +1045,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
# Disallow image tokens which does not include special
# begin-image and end-image tokens
...
...
vllm/model_executor/models/chatglm.py
View file @
1c3ffdbe
...
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
...
...
@@ -437,10 +436,8 @@ class ChatGLMBaseModel(nn.Module):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
...
...
vllm/model_executor/models/cohere2_vision.py
View file @
1c3ffdbe
...
...
@@ -21,7 +21,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalKwargsItems
from
vllm.multimodal.parse
import
(
ImageProcessorItems
,
ImageSize
,
...
...
@@ -478,7 +477,5 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
language_model
.
compute_logits
(
hidden_states
,
sampling_metadata
)
return
self
.
language_model
.
compute_logits
(
hidden_states
)
vllm/model_executor/models/commandr.py
View file @
1c3ffdbe
...
...
@@ -46,7 +46,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
,
row_parallel_weight_loader
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -448,15 +447,14 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
is_not_lora
=
hasattr
(
self
.
model
.
embed_tokens
,
'weight'
)
if
is_not_lora
:
logits
=
self
.
logits_processor
(
self
.
model
.
embed_tokens
,
hidden_states
,
sampling_metadata
)
hidden_states
)
else
:
logits
=
self
.
logits_processor
(
self
.
model
.
embed_tokens
.
base_layer
,
hidden_states
,
sampling_metadata
)
hidden_states
)
return
logits
...
...
vllm/model_executor/models/dbrx.py
View file @
1c3ffdbe
...
...
@@ -24,7 +24,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
...
...
@@ -462,10 +461,8 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
vllm/model_executor/models/deepseek.py
View file @
1c3ffdbe
...
...
@@ -49,7 +49,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
,
SupportsPP
...
...
@@ -488,10 +487,8 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
torch
.
Tensor
]:
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
,
sampling_metadata
)
logits
=
self
.
logits_processor
(
self
.
lm_head
,
hidden_states
)
return
logits
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
...
...
Prev
1
2
3
4
5
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment