Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2f8844ba
Unverified
Commit
2f8844ba
authored
Mar 10, 2024
by
Zhuohan Li
Committed by
GitHub
Mar 10, 2024
Browse files
Re-enable the 80 char line width limit (#3305)
parent
4b59f00e
Changes
67
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
110 additions
and
67 deletions
+110
-67
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+4
-2
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/quantization/gptq.py
+6
-4
vllm/model_executor/layers/quantization/marlin.py
vllm/model_executor/layers/quantization/marlin.py
+24
-15
vllm/model_executor/layers/quantization/squeezellm.py
vllm/model_executor/layers/quantization/squeezellm.py
+2
-1
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/sampler.py
+2
-1
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+2
-1
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+5
-3
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_j.py
+2
-1
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+2
-1
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+12
-7
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+2
-1
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/stablelm.py
+7
-6
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-1
vllm/model_executor/neuron_model_loader.py
vllm/model_executor/neuron_model_loader.py
+2
-1
vllm/model_executor/parallel_utils/communication_op.py
vllm/model_executor/parallel_utils/communication_op.py
+3
-2
vllm/model_executor/sampling_metadata.py
vllm/model_executor/sampling_metadata.py
+2
-1
vllm/sampling_params.py
vllm/sampling_params.py
+2
-2
vllm/sequence.py
vllm/sequence.py
+2
-1
vllm/spec_decode/batch_expansion.py
vllm/spec_decode/batch_expansion.py
+18
-11
vllm/spec_decode/multi_step_worker.py
vllm/spec_decode/multi_step_worker.py
+9
-5
No files found.
vllm/model_executor/layers/quantization/awq.py
View file @
2f8844ba
...
...
@@ -6,7 +6,8 @@ from torch.nn.parameter import Parameter
from
vllm._C
import
ops
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
set_weight_attrs
)
from
vllm.model_executor.layers.quantization.base_config
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
class
AWQConfig
(
QuantizationConfig
):
...
...
@@ -50,7 +51,8 @@ class AWQConfig(QuantizationConfig):
def
get_config_filenames
()
->
List
[
str
]:
return
[
"quant_config.json"
,
# E.g., casperhansen/vicuna-7b-v1.5-awq
"quantize_config.json"
,
# E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
# E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
"quantize_config.json"
,
]
@
classmethod
...
...
vllm/model_executor/layers/quantization/gptq.py
View file @
2f8844ba
...
...
@@ -31,8 +31,8 @@ class GPTQConfig(QuantizationConfig):
self
.
pack_factor
=
Fraction
(
32
,
self
.
weight_bits
)
if
self
.
weight_bits
not
in
[
2
,
3
,
4
,
8
]:
raise
ValueError
(
"Currently, only 2/3/4/8-bit weight quantization is
supported for
"
f
"GPTQ, but got
{
self
.
weight_bits
}
bits."
)
"Currently, only 2/3/4/8-bit weight quantization is "
f
"
supported for
GPTQ, but got
{
self
.
weight_bits
}
bits."
)
def
__repr__
(
self
)
->
str
:
return
(
f
"GPTQConfig(weight_bits=
{
self
.
weight_bits
}
, "
...
...
@@ -101,7 +101,8 @@ class GPTQLinearMethod(LinearMethodBase):
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size."
)
if
output_size_per_partition
%
self
.
quant_config
.
pack_factor
.
numerator
!=
0
:
if
(
output_size_per_partition
%
self
.
quant_config
.
pack_factor
.
numerator
!=
0
):
raise
ValueError
(
"The output size is not aligned with the quantized "
"weight shape. This can be caused by too large "
...
...
@@ -114,7 +115,8 @@ class GPTQLinearMethod(LinearMethodBase):
exllama_state
=
ExllamaState
.
UNINITIALIZED
scale_and_zero_size
=
input_size
//
group_size
scale_and_zero_input_dim
=
None
if
input_size
!=
input_size_per_partition
and
self
.
quant_config
.
group_size
!=
-
1
:
if
(
input_size
!=
input_size_per_partition
and
self
.
quant_config
.
group_size
!=
-
1
):
# For act-order models, we cannot use Exllama for row parallel layer
if
self
.
quant_config
.
desc_act
:
exllama_state
=
ExllamaState
.
UNUSED
...
...
vllm/model_executor/layers/quantization/marlin.py
View file @
2f8844ba
...
...
@@ -5,7 +5,8 @@ from torch.nn.parameter import Parameter
from
vllm._C
import
ops
from
vllm.model_executor.layers.linear
import
LinearMethodBase
,
set_weight_attrs
from
vllm.model_executor.layers.quantization.base_config
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
class
MarlinConfig
(
QuantizationConfig
):
...
...
@@ -22,8 +23,9 @@ class MarlinConfig(QuantizationConfig):
self
.
group_size
=
group_size
if
self
.
group_size
!=
128
and
self
.
group_size
!=
-
1
:
raise
ValueError
(
"Currently, only group size 128 and -1 (channelwise) is supported for "
f
"Marlin, but got group_size of
{
self
.
group_size
}
"
)
"Currently, only group size 128 and -1 (channelwise) "
"is supported for Marlin, but got group_size of "
f
"
{
self
.
group_size
}
"
)
# 4 Bits packed into 32 bit datatype.
self
.
pack_factor
=
32
//
4
...
...
@@ -37,7 +39,8 @@ class MarlinConfig(QuantizationConfig):
# Min in_features dim
self
.
min_k_threads
=
128
# Max parallel problems to solve at once (improves large batch performance)
# Max parallel problems to solve at once (improves large
# batch performance)
self
.
max_parallel
=
16
# Permutation length used by the marlin kernels.
...
...
@@ -102,22 +105,26 @@ class MarlinLinearMethod(LinearMethodBase):
# Validate output_size_per_partition
if
output_size_per_partition
%
self
.
quant_config
.
min_n_threads
!=
0
:
raise
ValueError
(
f
"Weight output_size_per_partition =
{
output_size_per_partition
}
is not divisible by min_n_threads =
{
self
.
quant_config
.
min_n_threads
}
."
)
f
"Weight output_size_per_partition = "
f
"
{
output_size_per_partition
}
is not divisible by "
f
"min_n_threads =
{
self
.
quant_config
.
min_n_threads
}
."
)
if
output_size_per_partition
%
self
.
quant_config
.
pack_factor
!=
0
:
raise
ValueError
(
f
"Weight output_size_per_partition =
{
output_size_per_partition
}
is not divisible by pack_factor =
{
self
.
quant_config
.
pack_factor
}
."
)
f
"Weight output_size_per_partition = "
f
"
{
output_size_per_partition
}
is not divisible by "
f
"pack_factor =
{
self
.
quant_config
.
pack_factor
}
."
)
# Validate input_size_per_partition
if
input_size_per_partition
%
self
.
quant_config
.
min_k_threads
!=
0
:
raise
ValueError
(
f
"Weight input_size_per_partition =
{
input_size_per_partition
}
is not divisible by min_k_threads =
{
self
.
quant_config
.
min_k_threads
}
."
)
if
self
.
quant_config
.
group_size
!=
-
1
and
input_size_per_partition
%
self
.
quant_config
.
group_size
!=
0
:
raise
ValueError
(
f
"Weight input_size_per_partition = f
{
input_size_per_partition
}
is not divisible by group_size =
{
self
.
quant_config
.
group_size
}
."
)
f
"Weight input_size_per_partition = "
f
"
{
input_size_per_partition
}
is not divisible by "
f
"min_k_threads =
{
self
.
quant_config
.
min_k_threads
}
."
)
if
(
self
.
quant_config
.
group_size
!=
-
1
and
input_size_per_partition
%
self
.
quant_config
.
group_size
!=
0
):
raise
ValueError
(
f
"Weight input_size_per_partition = "
f
"
{
input_size_per_partition
}
is not divisible by "
f
"group_size =
{
self
.
quant_config
.
group_size
}
."
)
# Check that we have at least 4 tiles horizontally in the shard
num_tiles_per_perm
=
self
.
quant_config
.
perm_len
//
(
...
...
@@ -149,7 +156,9 @@ class MarlinLinearMethod(LinearMethodBase):
)
# Determine if channelwise or not
input_groups
=
1
if
self
.
quant_config
.
group_size
==
-
1
else
input_size_per_partition
//
self
.
quant_config
.
group_size
input_groups
=
(
1
if
self
.
quant_config
.
group_size
==
-
1
else
input_size_per_partition
//
self
.
quant_config
.
group_size
)
scales
=
Parameter
(
torch
.
empty
(
...
...
vllm/model_executor/layers/quantization/squeezellm.py
View file @
2f8844ba
...
...
@@ -6,7 +6,8 @@ from torch.nn.parameter import Parameter
from
vllm._C
import
ops
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
set_weight_attrs
)
from
vllm.model_executor.layers.quantization.base_config
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.utils
import
is_hip
...
...
vllm/model_executor/layers/sampler.py
View file @
2f8844ba
...
...
@@ -6,7 +6,8 @@ import torch.nn as nn
from
vllm.model_executor.parallel_utils.communication_op
import
(
tensor_model_parallel_gather
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
,
SamplingTensors
from
vllm.model_executor.sampling_metadata
import
(
SamplingMetadata
,
SamplingTensors
)
from
vllm.sampling_params
import
SamplingParams
,
SamplingType
from
vllm.sequence
import
(
Logprob
,
PromptLogprobs
,
SampleLogprobs
,
SamplerOutput
,
SequenceData
,
SequenceGroupOutput
,
...
...
vllm/model_executor/models/baichuan.py
View file @
2f8844ba
...
...
@@ -333,7 +333,8 @@ class BaiChuanBaseForCausalLM(nn.Module):
if
"rotary_emb.inv_freq"
in
name
:
continue
if
name
==
"lm_head.weight"
:
# Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to:
# Unlike Baichuan, Baichuan2 normalizes the head weights.
# Refer to:
# https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
# Distinguish between Baichuan and Baichuan2 by checking the
# vocab size. This is suggested by
...
...
vllm/model_executor/models/deepseek.py
View file @
2f8844ba
...
...
@@ -119,7 +119,8 @@ class DeepseekMoE(nn.Module):
linear_method
=
None
)
if
config
.
n_shared_experts
is
not
None
:
intermediate_size
=
config
.
moe_intermediate_size
*
config
.
n_shared_experts
intermediate_size
=
(
config
.
moe_intermediate_size
*
config
.
n_shared_experts
)
self
.
shared_experts
=
DeepseekMLP
(
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
intermediate_size
,
...
...
@@ -273,8 +274,9 @@ class DeepseekDecoderLayer(nn.Module):
max_position_embeddings
=
max_position_embeddings
,
linear_method
=
linear_method
,
)
if
(
config
.
n_routed_experts
is
not
None
and
\
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekMoE
(
config
=
config
,
linear_method
=
linear_method
)
else
:
self
.
mlp
=
DeepseekMLP
(
...
...
vllm/model_executor/models/gpt_j.py
View file @
2f8844ba
...
...
@@ -143,7 +143,8 @@ class GPTJBlock(nn.Module):
linear_method
:
Optional
[
LinearMethodBase
]
=
None
,
):
super
().
__init__
()
inner_dim
=
4
*
config
.
n_embd
if
config
.
n_inner
is
None
else
config
.
n_inner
inner_dim
=
(
4
*
config
.
n_embd
if
config
.
n_inner
is
None
else
config
.
n_inner
)
self
.
ln_1
=
nn
.
LayerNorm
(
config
.
n_embd
,
eps
=
config
.
layer_norm_epsilon
)
self
.
attn
=
GPTJAttention
(
config
,
linear_method
)
self
.
mlp
=
GPTJMLP
(
inner_dim
,
config
,
linear_method
)
...
...
vllm/model_executor/models/internlm2.py
View file @
2f8844ba
...
...
@@ -305,7 +305,8 @@ class InternLM2ForCausalLM(nn.Module):
param
=
params_dict
[
name
]
if
"wqkv"
in
name
:
config
=
self
.
config
kv_groups
=
config
.
num_attention_heads
//
config
.
num_key_value_heads
kv_groups
=
(
config
.
num_attention_heads
//
config
.
num_key_value_heads
)
head_dim
=
config
.
hidden_size
//
config
.
num_attention_heads
loaded_weight
=
loaded_weight
.
view
(
-
1
,
2
+
kv_groups
,
head_dim
,
...
...
vllm/model_executor/models/olmo.py
View file @
2f8844ba
...
...
@@ -52,7 +52,8 @@ from vllm.model_executor.layers.linear import (
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.parallel_utils.parallel_state
import
(
get_tensor_model_parallel_world_size
,
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
...
...
@@ -81,7 +82,8 @@ class SwiGLU(nn.Module):
class
OlmoAttention
(
nn
.
Module
):
"""
This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
This is the attention block where the output is computed as
``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
(plus another skip connection).
"""
...
...
@@ -94,11 +96,12 @@ class OlmoAttention(nn.Module):
self
.
config
=
config
self
.
hidden_size
=
config
.
d_model
assert
config
.
d_model
%
config
.
n_heads
==
0
tensor_model_parallel_world_size
=
get_tensor_model_parallel_world_size
(
)
tensor_model_parallel_world_size
=
(
get_tensor_model_parallel_world_size
()
)
self
.
total_num_heads
=
self
.
config
.
n_heads
assert
self
.
total_num_heads
%
tensor_model_parallel_world_size
==
0
self
.
num_heads
=
self
.
total_num_heads
//
tensor_model_parallel_world_size
self
.
num_heads
=
(
self
.
total_num_heads
//
tensor_model_parallel_world_size
)
self
.
head_dim
=
self
.
hidden_size
//
self
.
total_num_heads
# Layer norms.
...
...
@@ -158,7 +161,8 @@ class OlmoAttention(nn.Module):
class
OlmoMLP
(
nn
.
Module
):
"""
This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
This is the MLP block where the output is computed as
``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
(plus another skip connection).
"""
...
...
@@ -217,7 +221,8 @@ class OlmoMLP(nn.Module):
class
OlmoBlock
(
nn
.
Module
):
"""
This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
This is a typical transformer block where the output is
computed as ``MLP(LN(x + Attention(LN(x))))``
(plus another skip connection).
"""
...
...
vllm/model_executor/models/qwen2.py
View file @
2f8844ba
...
...
@@ -170,7 +170,8 @@ class Qwen2DecoderLayer(nn.Module):
self
.
hidden_size
=
config
.
hidden_size
# Requires transformers > 4.32.0
rope_theta
=
getattr
(
config
,
"rope_theta"
,
1000000
)
use_sliding_window
=
config
.
use_sliding_window
and
layer_idx
<
config
.
max_window_layers
use_sliding_window
=
(
config
.
use_sliding_window
and
layer_idx
<
config
.
max_window_layers
)
self
.
self_attn
=
Qwen2Attention
(
hidden_size
=
self
.
hidden_size
,
num_heads
=
config
.
num_attention_heads
,
...
...
vllm/model_executor/models/stablelm.py
View file @
2f8844ba
# coding=utf-8
# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.
# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -16,7 +17,8 @@
# This code is based off the following work:
# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py
# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights."""
"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM)
model compatible with HuggingFace weights."""
from
typing
import
List
,
Optional
,
Tuple
import
torch
...
...
@@ -102,9 +104,9 @@ class StablelmAttention(nn.Module):
self
.
kv_size
=
self
.
num_key_value_heads
*
self
.
head_dim
self
.
qkv_bias
=
getattr
(
config
,
"use_qkv_bias"
,
False
)
if
(
self
.
head_dim
*
self
.
num_heads
*
tp_size
)
!=
self
.
hidden_size
:
raise
ValueError
(
f
"hidden_size must be divisible by num_heads
(got `hidden_size`:
{
self
.
hidden_size
}
"
f
" and `num_heads`:
{
self
.
num_heads
}
)."
)
raise
ValueError
(
f
"hidden_size must be divisible by num_heads "
f
"
(got `hidden_size`:
{
self
.
hidden_size
}
"
f
" and `num_heads`:
{
self
.
num_heads
}
)."
)
self
.
qkv_proj
=
QKVParallelLinear
(
self
.
hidden_size
,
self
.
head_dim
,
...
...
@@ -192,7 +194,6 @@ class StableLMEpochModel(nn.Module):
config
:
PretrainedConfig
,
linear_method
:
Optional
[
LinearMethodBase
]
=
None
)
->
None
:
super
().
__init__
()
# self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
self
.
embed_tokens
=
VocabParallelEmbedding
(
config
.
vocab_size
,
config
.
hidden_size
,
...
...
vllm/model_executor/models/starcoder2.py
View file @
2f8844ba
...
...
@@ -35,7 +35,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
,
ParallelLMHead
,
DEFAULT_VOCAB_PADDING_SIZE
)
from
vllm.model_executor.parallel_utils.parallel_state
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.parallel_utils.parallel_state
import
(
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.weight_utils
import
(
default_weight_loader
,
hf_model_weights_iterator
)
from
vllm.sequence
import
SamplerOutput
...
...
vllm/model_executor/neuron_model_loader.py
View file @
2f8844ba
...
...
@@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
def
get_model
(
model_config
:
ModelConfig
,
device_config
:
DeviceConfig
,
**
kwargs
)
->
nn
.
Module
:
from
transformers_neuronx.config
import
NeuronConfig
,
ContinuousBatchingConfig
from
transformers_neuronx.config
import
(
NeuronConfig
,
ContinuousBatchingConfig
)
parallel_config
=
kwargs
.
get
(
"parallel_config"
)
scheduler_config
=
kwargs
.
get
(
"scheduler_config"
)
...
...
vllm/model_executor/parallel_utils/communication_op.py
View file @
2f8844ba
...
...
@@ -11,7 +11,8 @@ from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_group
,
is_cupy_nccl_enabled_for_all_reduce
,
)
from
vllm.model_executor.parallel_utils.custom_all_reduce
import
custom_all_reduce
from
vllm.model_executor.parallel_utils.custom_all_reduce
import
(
custom_all_reduce
)
def
tensor_model_parallel_all_reduce
(
input_
:
torch
.
Tensor
)
->
torch
.
Tensor
:
...
...
@@ -24,7 +25,7 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
and GPU topology.
TLDR: always assume this function modifies its input, but use the return
value as the output.
value as the output.
"""
# Bypass the function if we are using only 1 GPU.
if
get_tensor_model_parallel_world_size
()
==
1
:
...
...
vllm/model_executor/sampling_metadata.py
View file @
2f8844ba
...
...
@@ -114,7 +114,8 @@ class SamplingTensors:
do_penalties
=
True
if
(
i
<
sampling_metadata
.
num_prompts
and
sampling_params
.
prompt_logprobs
is
not
None
):
# For tokens in the prompt that we only need to get their logprobs
# For tokens in the prompt that we only need to get
# their logprobs
prompt_len
=
sampling_metadata
.
prompt_lens
[
i
]
temperatures
+=
[
temperature
]
*
(
prompt_len
-
1
)
top_ps
+=
[
top_p
]
*
(
prompt_len
-
1
)
...
...
vllm/sampling_params.py
View file @
2f8844ba
...
...
@@ -74,8 +74,8 @@ class SamplingParams:
stop_token_ids: List of tokens that stop the generation when they are
generated. The returned output will contain the stop tokens unless
the stop tokens are special tokens.
include_stop_str_in_output: Whether to include the stop strings in
output
text. Defaults to False.
include_stop_str_in_output: Whether to include the stop strings in
output
text. Defaults to False.
ignore_eos: Whether to ignore the EOS token and continue generating
tokens after the EOS token is generated.
max_tokens: Maximum number of tokens to generate per output sequence.
...
...
vllm/sequence.py
View file @
2f8844ba
...
...
@@ -351,7 +351,8 @@ class SequenceGroup:
self
.
metrics
.
first_token_time
=
time
def
maybe_set_first_scheduled_time
(
self
,
time
:
float
)
->
None
:
"""Sets the first scheduled time and time in queue for Request level timings."""
"""Sets the first scheduled time and time in queue for Request
level timings."""
if
self
.
metrics
.
first_scheduled_time
is
None
:
self
.
metrics
.
first_scheduled_time
=
time
self
.
metrics
.
time_in_queue
=
time
-
self
.
metrics
.
arrival_time
...
...
vllm/spec_decode/batch_expansion.py
View file @
2f8844ba
...
...
@@ -5,8 +5,12 @@ import torch
from
vllm.sequence
import
(
SamplerOutput
,
SequenceGroupMetadata
,
SequenceData
)
from
vllm.worker.worker
import
Worker
from
vllm.spec_decode.util
import
nvtx_range
,
sampler_output_to_torch
,
get_all_seq_ids
,
split_batch_by_proposal_len
from
vllm.spec_decode.interfaces
import
SpeculativeScorer
,
SpeculativeProposals
,
SpeculativeScores
from
vllm.spec_decode.util
import
(
nvtx_range
,
sampler_output_to_torch
,
get_all_seq_ids
,
split_batch_by_proposal_len
)
from
vllm.spec_decode.interfaces
import
(
SpeculativeScorer
,
SpeculativeProposals
,
SpeculativeScores
)
SeqId
=
int
TargetSeqId
=
int
...
...
@@ -68,11 +72,12 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
proposal_lens_list
=
proposals
.
proposal_lens
.
tolist
()
proposal_token_ids_list
=
proposals
.
proposal_token_ids
.
tolist
()
spec_indices
,
non_spec_indices
,
target_seq_group_metadata_list
,
num_scoring_tokens
=
self
.
_expand_batch
(
seq_group_metadata_list
=
seq_group_metadata_list
,
proposal_token_ids_list
=
proposal_token_ids_list
,
proposal_lens_list
=
proposal_lens_list
,
)
(
spec_indices
,
non_spec_indices
,
target_seq_group_metadata_list
,
num_scoring_tokens
)
=
self
.
_expand_batch
(
seq_group_metadata_list
=
seq_group_metadata_list
,
proposal_token_ids_list
=
proposal_token_ids_list
,
proposal_lens_list
=
proposal_lens_list
,
)
target_sampler_output
=
self
.
_scorer_worker
.
execute_model
(
seq_group_metadata_list
=
target_seq_group_metadata_list
,
...
...
@@ -125,7 +130,8 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
num_scoring_tokens
=
len
(
target_seq_group_metadata_list
)
target_seq_group_metadata_list
.
extend
(
non_spec_seqs
)
return
spec_indices
,
non_spec_indices
,
target_seq_group_metadata_list
,
num_scoring_tokens
return
(
spec_indices
,
non_spec_indices
,
target_seq_group_metadata_list
,
num_scoring_tokens
)
def
_contract_batch
(
self
,
original_bs
:
int
,
target_sampler_output
:
List
[
SamplerOutput
],
...
...
@@ -306,10 +312,11 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
# Convert non-speculative output tokens to tensors.
sampler_output
.
sampled_token_probs
=
non_spec_probs
sampler_output
.
sampled_token_ids
=
non_spec_sampled_tokens
non_spec_target_token_ids
,
non_spec_target_probs
=
sampler_output_to_torch
(
[
sampler_output
])
non_spec_target_token_ids
,
non_spec_target_probs
=
(
sampler_output_to_torch
(
[
sampler_output
])
)
return
target_token_ids
,
target_probs
,
non_spec_target_token_ids
,
non_spec_target_probs
return
(
target_token_ids
,
target_probs
,
non_spec_target_token_ids
,
non_spec_target_probs
)
def
_create_target_seq_id_iterator
(
self
,
seq_ids
:
List
[
SeqId
])
->
Iterator
[
TargetSeqId
]:
...
...
vllm/spec_decode/multi_step_worker.py
View file @
2f8844ba
...
...
@@ -5,7 +5,8 @@ import torch
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.worker.worker
import
Worker
from
vllm.spec_decode.interfaces
import
SpeculativeProposals
,
SpeculativeProposer
from
vllm.spec_decode.interfaces
import
(
SpeculativeProposals
,
SpeculativeProposer
)
from
vllm.spec_decode.util
import
sampler_output_to_torch
...
...
@@ -247,8 +248,9 @@ class DraftModelTop1Proposer(SpeculativeProposer):
"""
# Split speculative- and non-speculative- sequences.
proposal_lens
,
nonzero_proposal_len_seqs
,
nonzero_proposal_len_indices
=
self
.
_split_by_max_model_len
(
seq_group_metadata_list
,
max_proposal_len
)
(
proposal_lens
,
nonzero_proposal_len_seqs
,
nonzero_proposal_len_indices
)
=
self
.
_split_by_max_model_len
(
seq_group_metadata_list
,
max_proposal_len
)
if
nonzero_proposal_len_seqs
:
# Speculate tokens using the draft worker for the speculative
...
...
@@ -306,7 +308,8 @@ class DraftModelTop1Proposer(SpeculativeProposer):
else
:
proposal_lens
.
append
(
0
)
return
proposal_lens
,
nonzero_proposal_len_seqs
,
nonzero_proposal_len_indices
return
(
proposal_lens
,
nonzero_proposal_len_seqs
,
nonzero_proposal_len_indices
)
def
_merge_outputs
(
self
,
...
...
@@ -356,7 +359,8 @@ class DraftModelTop1Proposer(SpeculativeProposer):
device
=
self
.
_device
)
entire_proposal_probs
[
nonzero_proposal_len_indices
]
=
proposal_probs
proposal_tokens
,
proposal_probs
=
entire_proposal_tokens
,
entire_proposal_probs
proposal_tokens
,
proposal_probs
=
(
entire_proposal_tokens
,
entire_proposal_probs
)
proposal_lens
=
torch
.
zeros
(
batch_size
,
dtype
=
torch
.
long
,
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment