Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
428dd144
Unverified
Commit
428dd144
authored
Aug 29, 2024
by
afeldman-nm
Committed by
GitHub
Aug 29, 2024
Browse files
[Core] Logprobs support in Multi-step (#7652)
parent
4abed65c
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
283 additions
and
75 deletions
+283
-75
vllm/executor/openvino_executor.py
vllm/executor/openvino_executor.py
+2
-1
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_gpu_executor.py
+2
-1
vllm/executor/ray_tpu_executor.py
vllm/executor/ray_tpu_executor.py
+2
-1
vllm/executor/tpu_executor.py
vllm/executor/tpu_executor.py
+2
-1
vllm/executor/xpu_executor.py
vllm/executor/xpu_executor.py
+2
-1
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/sampler.py
+247
-43
vllm/model_executor/model_loader/neuron.py
vllm/model_executor/model_loader/neuron.py
+1
-2
vllm/model_executor/model_loader/openvino.py
vllm/model_executor/model_loader/openvino.py
+1
-2
vllm/model_executor/models/arctic.py
vllm/model_executor/models/arctic.py
+2
-2
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+2
-2
vllm/model_executor/models/bart.py
vllm/model_executor/models/bart.py
+2
-2
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+2
-2
vllm/model_executor/models/bloom.py
vllm/model_executor/models/bloom.py
+2
-2
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+2
-2
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+2
-2
vllm/model_executor/models/commandr.py
vllm/model_executor/models/commandr.py
+2
-2
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+2
-2
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+2
-2
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+2
-2
vllm/model_executor/models/eagle.py
vllm/model_executor/models/eagle.py
+2
-1
No files found.
vllm/executor/openvino_executor.py
View file @
428dd144
...
...
@@ -9,7 +9,8 @@ from vllm.config import CacheConfig, ModelConfig
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
(
GiB_bytes
,
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
...
...
vllm/executor/ray_gpu_executor.py
View file @
428dd144
...
...
@@ -12,7 +12,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
from
vllm.executor.msgspec_utils
import
encode_hook
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
(
_run_task_with_lock
,
get_distributed_init_method
,
get_ip
,
get_open_port
,
get_vllm_instance_id
,
make_async
)
...
...
vllm/executor/ray_tpu_executor.py
View file @
428dd144
...
...
@@ -10,7 +10,8 @@ from vllm.executor.executor_base import ExecutorAsyncBase
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.executor.tpu_executor
import
TPUExecutor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
get_vllm_instance_id
,
make_async
)
...
...
vllm/executor/tpu_executor.py
View file @
428dd144
...
...
@@ -5,7 +5,8 @@ import torch
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
...
...
vllm/executor/xpu_executor.py
View file @
428dd144
...
...
@@ -9,7 +9,8 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
from
vllm.executor.executor_base
import
ExecutorAsyncBase
from
vllm.executor.gpu_executor
import
GPUExecutor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
PoolerOutput
,
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
PoolerOutput
from
vllm.utils
import
make_async
from
vllm.worker.worker_base
import
WorkerBase
...
...
vllm/model_executor/layers/sampler.py
View file @
428dd144
"""A layer that samples the next tokens from the model's outputs."""
import
itertools
import
warnings
from
dataclasses
import
dataclass
from
importlib.util
import
find_spec
from
math
import
inf
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
import
msgspec
import
torch
import
torch.nn
as
nn
from
vllm.spec_decode.metrics
import
SpecDecodeWorkerMetrics
from
vllm.triton_utils
import
HAS_TRITON
if
HAS_TRITON
:
...
...
@@ -19,8 +22,7 @@ from vllm.model_executor.sampling_metadata import (SamplingMetadata,
SequenceGroupToSample
)
from
vllm.sampling_params
import
SamplingType
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
Logprob
,
PromptLogprobs
,
SampleLogprobs
,
SamplerOutput
,
SequenceOutput
)
PromptLogprobs
,
SampleLogprobs
,
SequenceOutput
)
if
envs
.
VLLM_USE_FLASHINFER_SAMPLER
and
find_spec
(
"flashinfer"
):
import
flashinfer.sampling
...
...
@@ -35,6 +37,116 @@ else:
# (num_token_ids, num_parent_ids) per sequence group.
SampleResultType
=
List
[
Tuple
[
List
[
int
],
List
[
int
]]]
# Types of temporary data structures used for
# computing sample_result
SampleMetadataType
=
Dict
[
SamplingType
,
Tuple
[
List
[
int
],
List
[
SequenceGroupToSample
]]]
MultinomialSamplesType
=
Dict
[
SamplingType
,
torch
.
Tensor
]
SampleResultsDictType
=
Dict
[
int
,
Tuple
[
List
[
int
],
List
[
int
]]]
# Encapsulates temporary data structures for computing
# sample_result.
#
# * For multi-step scheduling: must be returned
# by `Sampler.forward()` and used later to compute the pythonized
# sample_result
#
# * For single-step scheduling: consumed immediately
# inside `Sampler.forward()` to compute pythonized sample_result.
@dataclass
class SampleResultArgsType:
    """Temporary data structures used to compute the Pythonized sample result.

    * Multi-step scheduling: returned by `Sampler.forward()` and consumed
      later, when Pythonization of the sample result is performed.
    * Single-step scheduling: consumed immediately inside
      `Sampler.forward()` to compute the Pythonized sample result.
    """
    # Per sampling type: (sequence group ids, sequence groups to sample).
    sample_metadata: SampleMetadataType
    # Per sampling type: tensor of multinomial samples; looked up for the
    # RANDOM and RANDOM_SEED sampling types during Pythonization.
    multinomial_samples: MultinomialSamplesType
    # Sequence group id -> (next token ids, parent ids). Filled in place
    # during Pythonization.
    sample_results_dict: SampleResultsDictType
    # Sampling metadata for the batch; its `seq_groups` length determines
    # how many entries the Pythonized result has.
    sampling_metadata: SamplingMetadata
    # Input to greedy-sample Pythonization, if any.
    greedy_samples: Optional[torch.Tensor]
    # Input to beam-search-sample Pythonization, if any.
    beam_search_logprobs: Optional[torch.Tensor]
# Union of non-deferred (single-step scheduling)
# vs deferred (multi-step scheduling)
# sample result types
MaybeDeferredSampleResultType
=
Union
[
SampleResultType
,
SampleResultArgsType
]
# Abbreviation of the _sample() return type
SampleReturnType
=
Tuple
[
MaybeDeferredSampleResultType
,
Optional
[
torch
.
Tensor
]]
class SamplerOutput(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """Sampler result for a batch of sequence groups.

    `outputs` holds, per sequence group, the candidate next tokens. The
    struct forwards indexing, item assignment and `len()` to `outputs`, so
    it can be used like a list, and it additionally carries optional
    tensors, timing values and metrics.
    """

    outputs: List[CompletionSequenceGroupOutput]

    # On-device tensor containing probabilities of each token.
    sampled_token_probs: Optional[torch.Tensor] = None

    # On-device tensor containing the logprobs of each token.
    logprobs: Optional["torch.Tensor"] = None

    # Arguments for later, deferred pythonization of the sampler result
    # (multi-step scheduling). None when the sampler result has already
    # been pythonized (single-step scheduling).
    deferred_sample_results_args: Optional[SampleResultArgsType] = None

    # On-device tensor containing the sampled token ids.
    sampled_token_ids: Optional[torch.Tensor] = None
    # CPU tensor containing the sampled token ids. Used during multi-step to
    # return the sampled token ids from last rank to AsyncLLMEngine to be
    # 'broadcasted' to all other PP ranks for next step.
    sampled_token_ids_cpu: Optional[torch.Tensor] = None

    # Spec decode metrics populated by workers.
    spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None

    # Optional last hidden states from the model.
    hidden_states: Optional[torch.Tensor] = None

    # Optional prefill hidden states from the model
    # (used for models like EAGLE).
    prefill_hidden_states: Optional[torch.Tensor] = None

    # Time taken in the forward pass for this across all workers
    model_forward_time: Optional[float] = None

    # Time taken in the model execute function. This will include model
    # forward, block/sync across workers, cpu-gpu sync time and sampling time.
    model_execute_time: Optional[float] = None

    def __getitem__(self, idx: int):
        """Return the entry of `outputs` at position `idx`."""
        return self.outputs[idx]

    def __setitem__(self, idx: int, value):
        """Replace the entry of `outputs` at position `idx` with `value`."""
        self.outputs[idx] = value

    def __len__(self):
        """Return the number of entries in `outputs`."""
        return len(self.outputs)

    def __eq__(self, other: object):
        """Compare by `outputs` only; other fields are ignored."""
        if not isinstance(other, self.__class__):
            return False
        return self.outputs == other.outputs

    def __repr__(self) -> str:
        """Render tensors by shape rather than by value to reduce noise."""
        probs = self.sampled_token_probs
        token_ids = self.sampled_token_ids
        metrics = self.spec_decode_worker_metrics
        probs_shape = "None" if probs is None else probs.shape
        ids_shape = "None" if token_ids is None else token_ids.shape
        return (f"SamplerOutput(outputs={self.outputs}, "
                f"sampled_token_probs={probs_shape}, "
                f"sampled_token_ids={ids_shape}, "
                f"spec_decode_worker_metrics={metrics})")
class
Sampler
(
nn
.
Module
):
"""Samples the next tokens from the model's outputs.
...
...
@@ -98,6 +210,19 @@ class Sampler(nn.Module):
sampling_metadata
:
SamplingMetadata
,
)
->
Optional
[
SamplerOutput
]:
"""
Single-step scheduling:
* Perform GPU-side sampling computation & compute
GPU-side logprobs tensor
* Pythonize sampling result & logprobs tensor
Multi-step scheduling:
* Perform GPU-side sampling computation & compute
GPU-side logprobs tensor
* Defer Pythonization of sampling result & logprobs
tensor
* Encapsulate arguments required for deferred Pythonization
in the :class:`SamplerOutput` structure
Args:
logits: (num_tokens, vocab_size).
sampling_metadata: Metadata for sampling.
...
...
@@ -150,7 +275,7 @@ class Sampler(nn.Module):
logprobs
=
torch
.
log_softmax
(
logits
,
dim
=-
1
,
dtype
=
torch
.
float
)
# Sample the next tokens.
sample_results
,
maybe_sampled_tokens_tensor
=
_sample
(
maybe_deferred_
sample_results
,
maybe_sampled_tokens_tensor
=
_sample
(
probs
,
logprobs
,
sampling_metadata
,
...
...
@@ -160,20 +285,28 @@ class Sampler(nn.Module):
)
if
self
.
include_gpu_probs_tensor
:
# Since we will defer sampler result Pythonization,
# preserve GPU-side tensors in support of later
# deferred pythonization of logprobs
assert
maybe_sampled_tokens_tensor
is
not
None
on_device_tensors
=
(
probs
,
logprobs
,
maybe_sampled_tokens_tensor
)
else
:
# Since Pythonization has already happened, don't preserve
# GPU-side tensors.
on_device_tensors
=
None
# Get the logprobs query results.
prompt_logprobs
=
None
sample_logprobs
=
None
if
not
sampling_metadata
.
skip_sampler_cpu_output
:
prompt_logprobs
,
sample_logprobs
=
_get_logprobs
(
logprobs
,
sampling_metadata
,
sample_results
)
# Pythonize logprobs now (GPU -> CPU); do not defer.
assert
not
isinstance
(
maybe_deferred_sample_results
,
SampleResultArgsType
)
prompt_logprobs
,
sample_logprobs
=
get_logprobs
(
logprobs
,
sampling_metadata
,
maybe_deferred_sample_results
)
return
_build_sampler_output
(
sample_results
,
maybe_deferred_
sample_results
,
sampling_metadata
,
prompt_logprobs
,
sample_logprobs
,
...
...
@@ -543,6 +676,60 @@ def _top_k_top_p_multinomial_with_flashinfer(
return
batch_next_token_ids
.
view
(
-
1
,
num_samples
)
def get_pythonized_sample_results(
        sample_result_args: SampleResultArgsType) -> SampleResultType:
    '''Turn GPU-side sampler results into Pythonized CPU-side results.

    This is where the GPU -> CPU sync for the sampler result takes place.

    Single-step scheduling: called at sampling time, so the result is
    Pythonized immediately.

    Multi-step scheduling: called only after multiple GPU-side steps have
    completed, i.e. Pythonization is deferred.

    Args:
        sample_result_args: GPU-side inputs to the Pythonization process.
            Its `sample_results_dict` is updated in place.

    Returns:
        Pythonized sampler results, one entry per sequence group in
        `sampling_metadata.seq_groups`; groups without a result get
        `([], [])`.
    '''
    args = sample_result_args
    per_type_metadata = args.sample_metadata
    results_by_group = args.sample_results_dict

    # Walk the sampling types in enum order; only types that actually have
    # sequence groups registered are processed.
    for sampling_type in SamplingType:
        if sampling_type in per_type_metadata:
            group_ids, groups = per_type_metadata[sampling_type]

            if sampling_type == SamplingType.GREEDY:
                sample_results = _greedy_sample(groups, args.greedy_samples)
            elif sampling_type in (SamplingType.RANDOM,
                                   SamplingType.RANDOM_SEED):
                sample_results = _random_sample(
                    groups, args.multinomial_samples[sampling_type])
            elif sampling_type == SamplingType.BEAM:
                sample_results = _beam_search_sample(
                    groups, args.beam_search_logprobs)

            # Record each group's result under its sequence group id.
            for group_id, group_result in zip(group_ids, sample_results):
                results_by_group[group_id] = group_result

    num_seq_groups = len(args.sampling_metadata.seq_groups)
    return [
        results_by_group.get(group_idx, ([], []))
        for group_idx in range(num_seq_groups)
    ]
def
_sample_with_torch
(
probs
:
torch
.
Tensor
,
logprobs
:
torch
.
Tensor
,
...
...
@@ -550,7 +737,19 @@ def _sample_with_torch(
sampling_tensors
:
SamplingTensors
,
include_gpu_probs_tensor
:
bool
,
modify_greedy_probs
:
bool
,
)
->
Tuple
[
SampleResultType
,
Optional
[
torch
.
Tensor
]]:
)
->
SampleReturnType
:
'''Torch-oriented _sample() implementation.
Single-step scheduling:
* Perform GPU-side sampling computation
* Immediately Pythonize sampling result
Multi-step scheduling:
* Perform GPU-side sampling computation
* Defer Pythonization & preserve GPU-side
tensors required for Pythonization
'''
categorized_seq_group_ids
:
Dict
[
SamplingType
,
List
[
int
]]
=
{
t
:
[]
for
t
in
SamplingType
}
...
...
@@ -560,10 +759,11 @@ def _sample_with_torch(
sampling_type
=
sampling_params
.
sampling_type
categorized_seq_group_ids
[
sampling_type
].
append
(
i
)
sample_results_dict
:
Dict
[
int
,
Tuple
[
List
[
int
],
List
[
int
]]]
=
{}
sample_metadata
:
Dict
[
SamplingType
,
Tuple
[
List
[
int
],
List
[
SequenceGroupToSample
]]]
=
{}
multinomial_samples
:
Dict
[
SamplingType
,
torch
.
Tensor
]
=
{}
sample_results_dict
:
SampleResultsDictType
=
{}
sample_metadata
:
SampleMetadataType
=
{}
multinomial_samples
:
MultinomialSamplesType
=
{}
greedy_samples
:
Optional
[
torch
.
Tensor
]
=
None
beam_search_logprobs
:
Optional
[
torch
.
Tensor
]
=
None
# Create output tensor for sampled token ids.
if
include_gpu_probs_tensor
:
...
...
@@ -638,32 +838,29 @@ def _sample_with_torch(
else
:
raise
ValueError
(
f
"Unsupported sampling type:
{
sampling_type
}
"
)
# GPU<->CPU sync happens in the loop below.
# This also converts the sample output to Python objects.
# Encapsulate arguments for computing Pythonized sampler
# results, whether deferred or otherwise.
maybe_deferred_args
=
SampleResultArgsType
(
sampling_metadata
=
sampling_metadata
,
sample_metadata
=
sample_metadata
,
multinomial_samples
=
multinomial_samples
,
greedy_samples
=
greedy_samples
,
beam_search_logprobs
=
beam_search_logprobs
,
sample_results_dict
=
sample_results_dict
)
if
not
sampling_metadata
.
skip_sampler_cpu_output
:
for
sampling_type
in
SamplingType
:
if
sampling_type
not
in
sample_metadata
:
continue
(
seq_group_id
,
seq_groups
)
=
sample_metadata
[
sampling_type
]
if
sampling_type
==
SamplingType
.
GREEDY
:
sample_results
=
_greedy_sample
(
seq_groups
,
greedy_samples
)
elif
sampling_type
in
(
SamplingType
.
RANDOM
,
SamplingType
.
RANDOM_SEED
):
sample_results
=
_random_sample
(
seq_groups
,
multinomial_samples
[
sampling_type
])
elif
sampling_type
==
SamplingType
.
BEAM
:
sample_results
=
_beam_search_sample
(
seq_groups
,
beam_search_logprobs
)
sample_results_dict
.
update
(
zip
(
seq_group_id
,
sample_results
))
sample_results
=
[
sample_results_dict
.
get
(
i
,
([],
[]))
for
i
in
range
(
len
(
sampling_metadata
.
seq_groups
))
]
# GPU<->CPU sync happens here.
# This also converts the sampler output to a Python object.
# Return Pythonized sampler result & sampled token ids
return
get_pythonized_sample_results
(
maybe_deferred_args
),
sampled_token_ids_tensor
else
:
sample_results
=
[]
return
sample_results
,
sampled_token_ids_tensor
# Defer sampler result Pythonization; return deferred
# Pythonization args & sampled token ids
return
(
maybe_deferred_args
,
sampled_token_ids_tensor
,
)
def
_sample_with_triton_kernel
(
...
...
@@ -755,7 +952,7 @@ def _sample(
sampling_tensors
:
SamplingTensors
,
include_gpu_probs_tensor
:
bool
,
modify_greedy_probs
:
bool
,
)
->
Tuple
[
SampleRe
sultType
,
Optional
[
torch
.
Tensor
]]
:
)
->
SampleRe
turnType
:
"""
Args:
probs: (num_query_tokens_in_batch, num_vocab)
...
...
@@ -803,7 +1000,7 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
return
result
.
sum
(
1
).
add_
(
1
)
def
_
get_logprobs
(
def
get_logprobs
(
logprobs
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
sample_results
:
SampleResultType
,
...
...
@@ -1126,7 +1323,7 @@ def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
def
_build_sampler_output
(
sample_results
:
SampleResultType
,
maybe_deferred_
sample_results
:
MaybeDeferred
SampleResultType
,
sampling_metadata
:
SamplingMetadata
,
prompt_logprobs
:
Optional
[
List
[
Optional
[
PromptLogprobs
]]],
sample_logprobs
:
Optional
[
List
[
SampleLogprobs
]],
...
...
@@ -1143,14 +1340,21 @@ def _build_sampler_output(
speculative decoding rejection sampling.
"""
sampler_output
:
List
[
CompletionSequenceGroupOutput
]
=
[]
if
not
skip_sampler_cpu_output
:
if
skip_sampler_cpu_output
:
assert
isinstance
(
maybe_deferred_sample_results
,
SampleResultArgsType
)
deferred_sample_results_args
=
maybe_deferred_sample_results
else
:
assert
prompt_logprobs
is
not
None
assert
sample_logprobs
is
not
None
assert
not
isinstance
(
maybe_deferred_sample_results
,
SampleResultArgsType
)
deferred_sample_results_args
=
None
for
(
seq_group
,
sample_result
,
group_prompt_logprobs
,
group_sample_logprobs
)
in
zip
(
sampling_metadata
.
seq_groups
,
sample_results
,
prompt_logprob
s
,
sample_logprobs
):
maybe_deferred_sample_result
s
,
prompt_logprobs
,
sample_logprobs
):
seq_ids
=
seq_group
.
seq_ids
next_token_ids
,
parent_ids
=
sample_result
seq_outputs
:
List
[
SequenceOutput
]
=
[]
...
...
@@ -1176,7 +1380,7 @@ def _build_sampler_output(
sampled_token_probs
=
sampled_token_probs
,
sampled_token_ids
=
sampled_token_ids
,
logprobs
=
logprobs_tensor
,
)
deferred_sample_results_args
=
deferred_sample_results_args
)
def
_get_next_prompt_tokens
(
seq_group
:
SequenceGroupToSample
)
->
List
[
int
]:
...
...
vllm/model_executor/model_loader/neuron.py
View file @
428dd144
...
...
@@ -10,9 +10,8 @@ from transformers import PretrainedConfig
from
vllm.config
import
ModelConfig
,
ParallelConfig
,
SchedulerConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
TORCH_DTYPE_TO_NEURON_AMP
=
{
"auto"
:
"f32"
,
...
...
vllm/model_executor/model_loader/openvino.py
View file @
428dd144
...
...
@@ -15,9 +15,8 @@ from vllm.config import DeviceConfig, ModelConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
(
LogitsProcessor
,
_prune_hidden_states
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/arctic.py
View file @
428dd144
...
...
@@ -23,13 +23,13 @@ from vllm.model_executor.layers.quantization.base_config import (
from
vllm.model_executor.layers.quantization.deepspeedfp
import
(
DeepSpeedFPConfig
,
DeepSpeedFPParameter
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.arctic
import
ArcticConfig
logger
=
init_logger
(
__name__
)
...
...
vllm/model_executor/models/baichuan.py
View file @
428dd144
...
...
@@ -38,12 +38,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
...
...
vllm/model_executor/models/bart.py
View file @
428dd144
...
...
@@ -34,12 +34,12 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
logger
=
logging
.
get_logger
(
__name__
)
...
...
vllm/model_executor/models/blip2.py
View file @
428dd144
...
...
@@ -13,13 +13,13 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.opt
import
OPTModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SamplerOutput
,
SequenceData
)
SequenceData
)
from
.blip
import
(
BlipVisionModel
,
dummy_image_for_blip
,
get_max_blip_image_tokens
)
...
...
vllm/model_executor/models/bloom.py
View file @
428dd144
...
...
@@ -34,12 +34,12 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
def
_get_alibi_slopes
(
total_num_heads
:
int
)
->
torch
.
Tensor
:
...
...
vllm/model_executor/models/chameleon.py
View file @
428dd144
...
...
@@ -22,7 +22,7 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
...
...
@@ -33,7 +33,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
repeat_and_pad_placeholder_tokens
)
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SamplerOutput
,
SequenceData
)
SequenceData
)
from
vllm.utils
import
print_warning_once
from
.interfaces
import
SupportsMultiModal
...
...
vllm/model_executor/models/chatglm.py
View file @
428dd144
...
...
@@ -20,12 +20,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
.interfaces
import
SupportsLoRA
...
...
vllm/model_executor/models/commandr.py
View file @
428dd144
...
...
@@ -38,14 +38,14 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
row_parallel_weight_loader
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
@
torch
.
compile
...
...
vllm/model_executor/models/dbrx.py
View file @
428dd144
...
...
@@ -17,13 +17,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.dbrx
import
DbrxConfig
...
...
vllm/model_executor/models/deepseek.py
View file @
428dd144
...
...
@@ -43,12 +43,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
DeepseekMLP
(
nn
.
Module
):
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
428dd144
...
...
@@ -43,12 +43,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
make_layers
...
...
vllm/model_executor/models/eagle.py
View file @
428dd144
...
...
@@ -5,12 +5,13 @@ import torch.nn as nn
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment