OpenDAS / text-generation-inference · Commits

Commit e019635f, authored Nov 01, 2024 by xuxzh1 🎱

    update

Parent: 64def8e2
Changes: 171
Showing 20 changed files with 0 additions and 4782 deletions (+0 −4782)

server/vllm/vllm/model_executor/input_metadata.py                     +0 −85
server/vllm/vllm/model_executor/layers/__init__.py                    +0 −0
server/vllm/vllm/model_executor/layers/activation.py                  +0 −56
server/vllm/vllm/model_executor/layers/attention.py                   +0 −466
server/vllm/vllm/model_executor/layers/layernorm.py                   +0 −32
server/vllm/vllm/model_executor/layers/quantized_linear/__init__.py   +0 −37
server/vllm/vllm/model_executor/layers/quantized_linear/awq.py        +0 −102
server/vllm/vllm/model_executor/layers/rotary_embedding.py            +0 −169
server/vllm/vllm/model_executor/layers/sampler.py                     +0 −591
server/vllm/vllm/model_executor/model_loader.py                       +0 −106
server/vllm/vllm/model_executor/models/__init__.py                    +0 −33
server/vllm/vllm/model_executor/models/aquila.py                      +0 −372
server/vllm/vllm/model_executor/models/baichuan.py                    +0 −389
server/vllm/vllm/model_executor/models/bloom.py                       +0 −328
server/vllm/vllm/model_executor/models/falcon.py                      +0 −503
server/vllm/vllm/model_executor/models/gpt2.py                        +0 −305
server/vllm/vllm/model_executor/models/gpt_bigcode.py                 +0 −346
server/vllm/vllm/model_executor/models/gpt_j.py                       +0 −268
server/vllm/vllm/model_executor/models/gpt_neox.py                    +0 −285
server/vllm/vllm/model_executor/models/internlm.py                    +0 −309
server/vllm/vllm/model_executor/input_metadata.py (deleted, 100644 → 0)

from typing import Dict, List, Optional, Tuple

import torch
from xformers.ops import AttentionBias

from vllm.sampling_params import SamplingParams
from vllm.sequence import SequenceData


class InputMetadata:
    """Metadata for input sequences. Used for PagedAttention.

    Args:
        seq_groups: List of (seq_ids, sampling_params).
        seq_data: Seq_id -> SequenceData.
        prompt_lens: Lengths of prompts.
        slot_mapping: The address to write the new KV to of each token.
        context_lens: the length of attention context for each generation token.
        max_context_len: The maximum context length.
        block_tables: The block tables. (Seq id -> list of physical block)
    """

    def __init__(
        self,
        seq_groups: List[Tuple[List[int], SamplingParams]],
        seq_data: Dict[int, SequenceData],
        prompt_lens: List[int],
        slot_mapping: torch.Tensor,
        context_lens: torch.Tensor,
        max_context_len: int,
        block_tables: torch.Tensor,
        sliding_window: Optional[int] = None,
    ) -> None:
        self.seq_groups = seq_groups
        self.seq_data = seq_data
        self.prompt_lens = prompt_lens
        self.slot_mapping = slot_mapping
        self.context_lens = context_lens
        self.max_context_len = max_context_len
        self.block_tables = block_tables

        self.max_prompt_len = max(prompt_lens) if prompt_lens else 0
        self.to_cache = None
        if sliding_window is not None:
            # We need to keep the positions of sliding windows within
            # the key / value tables, this is helpful to know which
            # elements we need to cache.
            to_cache, start_idx = [], 0
            for prompt_len in self.prompt_lens:
                to_cache.extend(
                    range(
                        start_idx + max(0, prompt_len - sliding_window),
                        start_idx + prompt_len,
                    ))
                start_idx += self.max_prompt_len
            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
            self.to_cache = torch.tensor(to_cache,
                                         dtype=torch.int32,
                                         device=self.slot_mapping.device)

        self.num_prompts = len(prompt_lens)
        self.num_prompt_tokens = self.num_prompts * self.max_prompt_len
        self.num_generation_tokens = context_lens.shape[0]
        if block_tables.numel() > 0:
            self.max_num_blocks_per_seq = block_tables.shape[1]
        else:
            self.max_num_blocks_per_seq = 0
        assert block_tables.shape[0] == self.num_generation_tokens
        assert context_lens.shape[0] == self.num_generation_tokens

        # Set during the execution of the first attention op.
        self.attn_bias: Optional[AttentionBias] = None

    def __repr__(self) -> str:
        # Print only useful metadata.
        return (f'InputMetadata('
                f'num_prompt_tokens={self.num_prompt_tokens}, '
                f'num_prompts={self.num_prompts}, '
                f'prompt_lens={self.prompt_lens}, '
                f'num_generation_tokens={self.num_generation_tokens}, '
                f'context_lens={self.context_lens}, '
                f'max_context_len={self.max_context_len}), '
                f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
                f'block_tables={self.block_tables}), '
                f'slot_mapping={self.slot_mapping}')
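
Note: the sliding-window branch above only keeps the last `sliding_window` positions of each (padded) prompt plus every generation slot. A minimal pure-Python sketch of that index computation, using made-up example lengths rather than values from this repo:

# Sketch of the to_cache index computation above (pure Python, no vLLM needed).
# Assumed example: two prompts padded to max_prompt_len, window of 4,
# two extra generation-token slots at the end.
prompt_lens = [6, 3]
sliding_window = 4
max_prompt_len = max(prompt_lens)                   # prompts are padded to this
num_slots = len(prompt_lens) * max_prompt_len + 2   # + 2 generation slots

to_cache, start_idx = [], 0
for prompt_len in prompt_lens:
    # Keep only the last `sliding_window` tokens of each prompt.
    to_cache.extend(range(start_idx + max(0, prompt_len - sliding_window),
                          start_idx + prompt_len))
    start_idx += max_prompt_len
# All generation-token slots are always cached.
to_cache.extend(range(start_idx, num_slots))
print(to_cache)   # [2, 3, 4, 5, 6, 7, 8, 12, 13]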
server/vllm/vllm/model_executor/layers/__init__.py (deleted, 100644 → 0; file was empty)
server/vllm/vllm/model_executor/layers/activation.py (deleted, 100644 → 0)

"""Custom activation functions."""
import torch
import torch.nn as nn

from vllm import activation_ops


class SiluAndMul(nn.Module):
    """An activation function for SwiGLU.

    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.

    Shapes:
        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        activation_ops.silu_and_mul(out, x)
        return out


class NewGELU(nn.Module):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        activation_ops.gelu_new(out, x)
        return out


class FastGELU(nn.Module):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        activation_ops.gelu_fast(out, x)
        return out


_ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
    "gelu_fast": FastGELU(),
    "gelu_new": NewGELU(),
    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
    "relu": nn.ReLU(),
}


def get_act_fn(act_fn: str) -> nn.Module:
    """Get an activation function by name."""
    act_fn = act_fn.lower()
    if act_fn in _ACTIVATION_REGISTRY:
        return _ACTIVATION_REGISTRY[act_fn]
    raise ValueError(f"Activation function {act_fn!r} is not supported.")
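
Note: per its docstring, SiluAndMul computes silu(x[:d]) * x[d:]. A plain-PyTorch reference sketch of that computation (standard torch ops only, not the fused activation_ops kernel):

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half and compute silu(x1) * x2,
    # matching the shape contract in the SiluAndMul docstring.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 16)                    # (num_tokens, 2 * d)
print(silu_and_mul_reference(x).shape)    # torch.Size([4, 8])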
server/vllm/vllm/model_executor/layers/attention.py (deleted, 100644 → 0)

"""Multi-head attention."""
from typing import Any, Dict, List, Optional

import torch
import torch.nn as nn
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
                                          LowerTriangularMaskWithTensorBias)

from vllm import attention_ops
from vllm import cache_ops
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.rotary_embedding import (
    DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding,
    RotaryEmbedding)

_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512


class PagedAttention(nn.Module):
    # pylint: disable=line-too-long
    """GPT-style multi-head PagedAttention.

    This class takes query, key, and value tensors as input. The input tensors
    can either contain prompt tokens or generation tokens, in addition to
    paddings.

    The class does the following:
    1. Perform multi_query_kv_attention for the prompts. This operation does
        not use the KV cache.
    2. Wait for the cache operations (e.g., swap, copy) to finish. The cache
        operations are issued by the cache engine before executing the forward
        pass of the model, and they are executed asynchronously.
    3. Reshape and store the input key and value tensors in the KV cache.
    4. Perform single_query_cached_kv_attention for the generation tokens.
        This operation reads the previous key and value tensors from the KV
        cache.
    5. Return the output tensor.
    """

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 scale: float,
                 num_kv_heads: Optional[int] = None,
                 sliding_window: Optional[int] = None) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.sliding_window = sliding_window

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.head_mapping = torch.repeat_interleave(
            torch.arange(self.num_kv_heads, dtype=torch.int32, device="cuda"),
            self.num_queries_per_kv)

        if self.head_size not in _SUPPORTED_HEAD_SIZES:
            raise ValueError(f"head_size ({self.head_size}) is not supported. "
                             f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")

    def set_attn_bias(
        self,
        input_metadata: InputMetadata,
        dtype: torch.dtype,
    ) -> None:
        del dtype  # Unused.
        if input_metadata.attn_bias is not None:
            # Already set by a previous layer.
            return
        prompt_lens = [input_metadata.max_prompt_len
                       ] * input_metadata.num_prompts
        attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
        if self.sliding_window is not None:
            attn_bias = attn_bias.make_local_attention(self.sliding_window)
        input_metadata.attn_bias = attn_bias

    def multi_query_kv_attention(
        self,
        output: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        """Normal attention for the prompt tokens.

        Args:
            output: shape = [num_prompt_tokens, num_heads, head_size]
            query: shape = [num_prompt_tokens, num_heads, head_size]
            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
            input_metadata: metadata for paged attention.
        """
        if self.num_kv_heads != self.num_heads:
            # Project the key and value tensors to the desired number of heads.
            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
            value = torch.repeat_interleave(value,
                                            self.num_queries_per_kv,
                                            dim=1)

        # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
        out = xops.memory_efficient_attention_forward(
            query.unsqueeze(0),
            key.unsqueeze(0),
            value.unsqueeze(0),
            attn_bias=input_metadata.attn_bias,
            p=0.0,
            scale=self.scale,
        )
        # TODO(woosuk): Unnecessary copy. Optimize.
        output.copy_(out.squeeze(0))
        return output

    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
        """Returns the slopes for the alibi attention bias.

        Returns:
            slopes: shape = [num_heads]
        """
        return None

    def single_query_cached_kv_attention(
        self,
        output: torch.Tensor,
        query: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        input_metadata: InputMetadata,
        alibi_slopes: Optional[torch.Tensor],
    ) -> None:
        """PagedAttention for the generation tokens.

        Args:
            output: shape = [num_generation_tokens, num_heads, head_size]
            query: shape = [num_generation_tokens, num_heads, head_size]
            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
                block_size, x]
            value_cache: shape = [num_blocks, num_kv_heads, head_size,
                block_size]
            input_metadata: metadata for paged attention.
            alibi_slopes: shape = [num_heads]
        """
        block_size = value_cache.shape[3]
        num_seqs, num_heads, head_size = query.shape
        max_num_partitions = (
            (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
            _PARTITION_SIZE)
        # NOTE(woosuk): We use a simple heuristic to decide whether to use
        # PagedAttention V1 or V2. If the number of partitions is 1, we use
        # V1 to avoid the overhead of reduction. Also, if the number of
        # sequences or heads is large, we use V1 since there is enough work
        # to parallelize.
        # TODO(woosuk): Tune this heuristic.
        use_v1 = max_num_partitions == 1 or num_seqs * num_heads > 512
        if use_v1:
            # Run PagedAttention V1.
            attention_ops.paged_attention_v1(
                output,
                query,
                key_cache,
                value_cache,
                self.head_mapping,
                self.scale,
                input_metadata.block_tables,
                input_metadata.context_lens,
                block_size,
                input_metadata.max_context_len,
                alibi_slopes,
            )
        else:
            # Run PagedAttention V2.
            assert _PARTITION_SIZE % block_size == 0
            tmp_output = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions, head_size),
                dtype=output.dtype,
                device=output.device,
            )
            exp_sums = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions),
                dtype=torch.float32,
                device=output.device,
            )
            max_logits = torch.empty_like(exp_sums)
            attention_ops.paged_attention_v2(
                output,
                exp_sums,
                max_logits,
                tmp_output,
                query,
                key_cache,
                value_cache,
                self.head_mapping,
                self.scale,
                input_metadata.block_tables,
                input_metadata.context_lens,
                block_size,
                input_metadata.max_context_len,
                alibi_slopes,
            )

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: Optional[torch.Tensor],
        value_cache: Optional[torch.Tensor],
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        """PagedAttention forward pass.

        NOTE: The query, key, and value tensors must be sliced from a qkv
        tensor of shape [batch_size, seq_len, 3 * num_heads * head_size].

        Args:
            query: shape = [batch_size, seq_len, num_heads * head_size]
            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
            value: shape = [batch_size, num_kv_heads * head_size]
            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
                block_size, x]
            value_cache: shape = [num_blocks, num_kv_heads, head_size,
                block_size]
            input_metadata: metadata for paged attention.
            cache_event: event to wait for the cache operations to finish.

        Returns:
            shape = [batch_size, seq_len, num_heads * head_size]
        """
        batch_size, seq_len, _ = query.shape

        # Reshape the query, key, and value tensors.
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_kv_heads, self.head_size)
        value = value.view(-1, self.num_kv_heads, self.head_size)

        # Pre-allocate the output tensor.
        output = torch.empty_like(query)

        # Compute the attention op for prompts.
        num_prompt_tokens = input_metadata.num_prompt_tokens
        if num_prompt_tokens > 0:
            # Prompt run.
            assert input_metadata.num_generation_tokens == 0
            self.set_attn_bias(input_metadata, dtype=query.dtype)
            self.multi_query_kv_attention(
                output,
                query,
                key,
                value,
                input_metadata,
            )

        # Wait until the cache op is done.
        if cache_event is not None:
            cache_event.wait()

        # Reshape the keys and values and store them in the cache.
        # When key_cache and value_cache are not provided, the new key
        # and value vectors will not be cached.
        if key_cache is not None and value_cache is not None:
            key_to_cache = key
            value_to_cache = value
            slot_mapping = input_metadata.slot_mapping.view(-1)
            if input_metadata.to_cache is not None:
                key_to_cache = key_to_cache[input_metadata.to_cache]
                value_to_cache = value_to_cache[input_metadata.to_cache]
                slot_mapping = slot_mapping[input_metadata.to_cache]

            cache_ops.reshape_and_cache(
                key_to_cache,
                value_to_cache,
                key_cache,
                value_cache,
                slot_mapping,
            )

        if input_metadata.num_generation_tokens > 0:
            # Decoding run.
            assert input_metadata.num_prompt_tokens == 0
            assert key_cache is not None and value_cache is not None, (
                "key_cache and value_cache must be provided when "
                "generating tokens.")
            # Compute the attention op for generation tokens.
            self.single_query_cached_kv_attention(output, query, key_cache,
                                                  value_cache, input_metadata,
                                                  self.get_alibi_slopes())

        # Reshape the output tensor.
        # NOTE(woosuk): The output tensor may include paddings.
        return output.view(batch_size, seq_len,
                           self.num_heads * self.head_size)


class PagedAttentionWithRoPE(PagedAttention):
    """PagedAttention with rotary positional embedding."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        rotary_dim: int,
        max_position: int = 8192,
        base: int = 10000,
        num_kv_heads: Optional[int] = None,
        is_neox_style: bool = True,
        rope_scaling: Optional[Dict[str, Any]] = None,
        sliding_window: Optional[int] = None,
    ) -> None:
        super().__init__(num_heads,
                         head_size,
                         scale,
                         num_kv_heads,
                         sliding_window=sliding_window)
        if rope_scaling is None:
            self.rotary_emb = RotaryEmbedding(head_size, rotary_dim,
                                              max_position, base,
                                              is_neox_style)
        else:
            scaling_type = rope_scaling["type"]
            scaling_factor = rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LinearScalingRotaryEmbedding(
                    head_size, rotary_dim, max_position, base, is_neox_style,
                    scaling_factor)
            elif scaling_type == "dynamic":
                self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
                    head_size, rotary_dim, max_position, base, is_neox_style,
                    scaling_factor)
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        """ PagedAttention forward pass with rotary embedding.

        Args:
            positions: shape = [batch_size, seq_len]
            query: shape = [batch_size, seq_len, num_heads * head_size]
            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
                block_size, x]
            value_cache: shape = [num_blocks, num_kv_heads, head_size,
                block_size]
            input_metadata: metadata for paged attention.
            cache_event: event to wait for the cache operations to finish.

        Returns:
            shape = [batch_size, seq_len, num_heads * head_size]
        """
        # Apply rotary embedding to the query and key before passing them
        # to the attention op.
        query, key = self.rotary_emb(positions, query, key)
        return super().forward(
            query,
            key,
            value,
            key_cache,
            value_cache,
            input_metadata,
            cache_event,
        )


class PagedAttentionWithALiBi(PagedAttention):
    """PagedAttention with ALiBi attention bias."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 scale: float,
                 slopes: List[float],
                 num_kv_heads: Optional[int] = None) -> None:
        super().__init__(num_heads, head_size, scale, num_kv_heads)
        assert len(slopes) == num_heads

        slopes = torch.tensor(slopes, dtype=torch.float32)
        self.register_buffer("alibi_slopes", slopes, persistent=False)

    def set_attn_bias(self, input_metadata: InputMetadata,
                      dtype: torch.dtype) -> None:
        if input_metadata.attn_bias is not None:
            # Already set by a previous layer.
            return
        # Generates ALiBi mask based on the max prompt length.
        max_prompt_len = input_metadata.max_prompt_len
        bias = torch.arange(max_prompt_len, dtype=dtype)
        # NOTE(zhuohan): HF uses
        #     `bias = bias[None, :].repeat(prompt_len, 1)`
        # here. We find that both biases give the same results, but
        # the bias below more accurately follows the original ALiBi
        # paper.
        bias = bias[None, :] - bias[:, None]
        bias = bias.to(self.alibi_slopes.device)

        # When using custom attention bias, xformers requires the bias to
        # be sliced from a tensor whose length is a multiple of 8.
        padded_len = (max_prompt_len + 7) // 8 * 8
        bias = torch.empty(
            input_metadata.num_prompts,
            self.num_heads,
            max_prompt_len,
            padded_len,
            device=self.alibi_slopes.device,
            dtype=dtype,
        )[:, :, :, :max_prompt_len].copy_(bias)
        bias.mul_(self.alibi_slopes[:, None, None])
        attn_bias = LowerTriangularMaskWithTensorBias(bias)
        input_metadata.attn_bias = attn_bias

    def multi_query_kv_attention(
        self,
        output: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        input_metadata: InputMetadata,
    ) -> torch.Tensor:
        """Attention with ALiBi bias for the prompt tokens.

        Args:
            output: shape = [num_prompt_tokens, num_heads, head_size]
            query: shape = [num_prompt_tokens, num_heads, head_size]
            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
            input_metadata: metadata for paged attention.
        """
        if self.num_kv_heads != self.num_heads:
            # Project the key and value tensors to the desired number of heads.
            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
            value = torch.repeat_interleave(value,
                                            self.num_queries_per_kv,
                                            dim=1)

        batch_size = input_metadata.num_prompts
        seq_len = input_metadata.max_prompt_len
        out = xops.memory_efficient_attention_forward(
            query.view(batch_size, seq_len, self.num_heads, self.head_size),
            key.view(batch_size, seq_len, self.num_heads, self.head_size),
            value.view(batch_size, seq_len, self.num_heads, self.head_size),
            attn_bias=input_metadata.attn_bias,
            p=0.0,
            scale=self.scale,
        )
        # TODO(woosuk): Unnecessary copy. Optimize.
        output.copy_(out.view(-1, self.num_heads, self.head_size))
        return output

    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
        return self.alibi_slopes
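
Note: the V1/V2 dispatch in single_query_cached_kv_attention reduces to two integers, the partition count and the amount of parallel work. A standalone sketch of that arithmetic, using assumed example numbers rather than values from this repo:

# Sketch of the V1/V2 dispatch heuristic above, with assumed example numbers.
_PARTITION_SIZE = 512                      # matches the constant in attention.py

max_context_len = 1800                     # assumed example value
num_seqs, num_heads = 8, 32                # assumed example batch shape

# Ceiling division: number of partitions V2 would reduce over.
max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
use_v1 = max_num_partitions == 1 or num_seqs * num_heads > 512

print(max_num_partitions, use_v1)          # 4 False -> PagedAttention V2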
server/vllm/vllm/model_executor/layers/layernorm.py (deleted, 100644 → 0)

"""Custom normalization layers."""
import torch
import torch.nn as nn

from vllm import layernorm_ops


class RMSNorm(nn.Module):
    """Root mean square normalization.

    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
    Refer to https://arxiv.org/abs/1910.07467
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-6,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
        layernorm_ops.rms_norm(
            out,
            x,
            self.weight.data,
            self.variance_epsilon,
        )
        return out
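
Note: the docstring's formula w * x / sqrt(E[x^2] + eps) has a direct plain-PyTorch reference; the sketch below uses only standard torch ops, not the custom layernorm_ops kernel:

import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # w * x / sqrt(mean(x^2) + eps), reduced over the hidden dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return weight * x * torch.rsqrt(variance + eps)

x = torch.randn(2, 8)
w = torch.ones(8)
print(rms_norm_reference(x, w).shape)   # torch.Size([2, 8])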
server/vllm/vllm/model_executor/layers/quantized_linear/__init__.py (deleted, 100644 → 0)

from vllm.model_executor.layers.quantized_linear.awq import (
    AWQColumnParallelLinear, AWQRowParallelLinear)
from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
                                                       RowParallelLinear)

_QUANTIZED_LINEAR_REGISTRY = {
    "awq": (AWQColumnParallelLinear, AWQRowParallelLinear),
}


class ParallelLinear:

    @classmethod
    def column(cls, *args, **kwargs) -> ColumnParallelLinear:
        quant_config = kwargs.get("quant_config", None)
        if quant_config is None:
            return ColumnParallelLinear(*args, **kwargs)
        name = quant_config.get_name()
        if name not in _QUANTIZED_LINEAR_REGISTRY:
            raise ValueError(f"No quantized linear is found for {name}")
        quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][0]
        return quant_linear_cls(*args, **kwargs)

    @classmethod
    def row(cls, *args, **kwargs) -> RowParallelLinear:
        quant_config = kwargs.get("quant_config", None)
        if quant_config is None:
            return RowParallelLinear(*args, **kwargs)
        name = quant_config.get_name()
        if name not in _QUANTIZED_LINEAR_REGISTRY:
            raise ValueError(f"No quantized linear is found for {name}")
        quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][1]
        return quant_linear_cls(*args, **kwargs)
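
Note: ParallelLinear.column / .row are factories that dispatch on quant_config.get_name(). A toy, self-contained sketch of the same registry pattern; every class name below is a hypothetical stand-in, not a vLLM class:

# Toy sketch of the registry dispatch above; all names here are hypothetical.
class PlainLinear:
    def __init__(self, *args, **kwargs): ...

class PackedLinear:
    def __init__(self, *args, **kwargs): ...

class FakeAWQConfig:
    def get_name(self) -> str:
        return "awq"

_REGISTRY = {"awq": PackedLinear}

def make_linear(*args, **kwargs):
    quant_config = kwargs.get("quant_config", None)
    if quant_config is None:
        return PlainLinear(*args, **kwargs)
    name = quant_config.get_name()
    if name not in _REGISTRY:
        raise ValueError(f"No quantized linear is found for {name}")
    return _REGISTRY[name](*args, **kwargs)

print(type(make_linear(quant_config=FakeAWQConfig())).__name__)   # PackedLinear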
server/vllm/vllm/model_executor/layers/quantized_linear/awq.py (deleted, 100644 → 0)

from typing import Optional

import torch
from torch.nn.parameter import Parameter

from vllm import quantization_ops
from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
                                                       RowParallelLinear)


class AWQColumnParallelLinear(ColumnParallelLinear):

    def create_weights(self, dtype: torch.dtype) -> None:
        assert self.input_size % self.quant_config.weight_bits == 0
        assert (self.output_size_per_partition %
                self.quant_config.pack_factor == 0)
        self.qweight = Parameter(
            torch.empty(
                self.input_size,
                self.output_size_per_partition //
                self.quant_config.pack_factor,
                device="cuda",
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        self.qzeros = Parameter(
            torch.empty(
                self.input_size // self.quant_config.group_size,
                self.output_size_per_partition //
                self.quant_config.pack_factor,
                device="cuda",
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        self.scales = Parameter(
            torch.empty(
                self.input_size // self.quant_config.group_size,
                self.output_size_per_partition,
                device="cuda",
                dtype=dtype,
            ),
            requires_grad=False,
        )

    def apply_weights(
        self,
        x: torch.Tensor,
        bias: Optional[torch.Tensor],
    ) -> torch.Tensor:
        pack_factor = self.quant_config.pack_factor
        out_shape = (x.shape[:-1] + (self.qweight.shape[-1] * pack_factor, ))
        reshaped_x = x.reshape(-1, x.shape[-1])
        out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales,
                                        self.qzeros, pack_factor)
        if bias is not None:
            out = out + bias
        return out.reshape(out_shape)


class AWQRowParallelLinear(RowParallelLinear):

    def create_weights(self, dtype: torch.dtype) -> None:
        assert (self.input_size_per_partition %
                self.quant_config.weight_bits == 0)
        assert self.output_size % self.quant_config.pack_factor == 0
        self.qweight = Parameter(
            torch.empty(
                self.input_size_per_partition,
                self.output_size // self.quant_config.pack_factor,
                device="cuda",
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        self.qzeros = Parameter(
            torch.empty(
                self.input_size_per_partition // self.quant_config.group_size,
                self.output_size // self.quant_config.pack_factor,
                device="cuda",
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        self.scales = Parameter(
            torch.empty(
                self.input_size_per_partition // self.quant_config.group_size,
                self.output_size,
                device="cuda",
                dtype=dtype,
            ),
            requires_grad=False,
        )

    def apply_weights(self, x: torch.Tensor) -> torch.Tensor:
        pack_factor = self.quant_config.pack_factor
        out_shape = (x.shape[:-1] + (self.qweight.shape[-1] * pack_factor, ))
        reshaped_x = x.reshape(-1, x.shape[-1])
        out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales,
                                        self.qzeros, pack_factor)
        return out.reshape(out_shape)
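
Note: the parameter shapes above follow AWQ's int32 packing: each int32 column holds pack_factor low-bit values, and scales/zeros are stored once per group_size input rows. A short shape-arithmetic sketch under assumed example settings (4-bit weights, pack_factor 8, group_size 128 are typical AWQ defaults, not values read from this repo):

# Shape arithmetic for the AWQ parameters above, with assumed example sizes.
input_size = 4096
output_size = 11008
weight_bits = 4                    # assumed typical AWQ setting
pack_factor = 32 // weight_bits    # 8 four-bit values per int32
group_size = 128                   # assumed typical AWQ setting

qweight_shape = (input_size, output_size // pack_factor)
qzeros_shape = (input_size // group_size, output_size // pack_factor)
scales_shape = (input_size // group_size, output_size)

print(qweight_shape, qzeros_shape, scales_shape)
# (4096, 1376) (32, 1376) (32, 11008)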
server/vllm/vllm/model_executor/layers/rotary_embedding.py (deleted, 100644 → 0)

# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Rotary Positional Embeddings."""
from typing import Tuple, Union

import torch
import torch.nn as nn

from vllm import pos_encoding_ops


class RotaryEmbedding(nn.Module):
    """Original rotary positional embedding."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        is_neox_style: bool,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style

        cache = self._compute_cos_sin_cache()
        cache = cache.to(torch.get_default_dtype())
        self.register_buffer("cos_sin_cache", cache, persistent=False)

    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
        """Compute the inverse frequency."""
        # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
        # However, we use `torch.arange(..., dtype=torch.float)` instead to
        # avoid numerical issues with large base values (e.g., 10000000).
        # This may cause a slight numerical difference between the HF
        # implementation and ours.
        # NOTE(woosuk): To exactly match the HF implementation, we need to
        # use CPU to compute the cache and then move it to GPU. However, we
        # create the cache on GPU for faster initialization. This may cause
        # a slight numerical difference between the HF implementation and ours.
        inv_freq = 1.0 / (base**(torch.arange(
            0, self.rotary_dim, 2, dtype=torch.float, device="cuda") /
                                 self.rotary_dim))
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos and sin cache."""
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings,
                         dtype=torch.float,
                         device="cuda")

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # pos_encoding_ops.rotary_embedding() is an in-place operation that
        # updates the query and key tensors.
        pos_encoding_ops.rotary_embedding(positions, query, key,
                                          self.head_size, self.cos_sin_cache,
                                          self.is_neox_style)
        return query, key


class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.

    Credits to the Reddit user /u/kaiokendev
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        is_neox_style: bool,
        scaling_factor: float,
    ) -> None:
        self.scaling_factor = scaling_factor
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style)

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.base)
        # NOTE(woosuk): self.max_position_embeddings is the original
        # maximum length before applying the rope scaling.
        # Thus, the maximum length after applying the rope scaling is
        # self.max_position_embeddings * self.scaling_factor.
        max_len = self.max_position_embeddings * self.scaling_factor
        t = torch.arange(max_len, dtype=torch.float, device="cuda")
        t = t / self.scaling_factor

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache


class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling.

    Credits to the Reddit users /u/bloc97 and /u/emozilla
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        is_neox_style: bool,
        scaling_factor: float,
    ) -> None:
        self.scaling_factor = scaling_factor
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style)

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        # NOTE(woosuk): self.max_position_embeddings is the original
        # maximum length before applying the rope scaling.
        # Thus, the maximum length after applying the rope scaling is
        # self.max_position_embeddings * self.scaling_factor.
        max_len = self.max_position_embeddings * self.scaling_factor
        base = self.base * (
            (self.scaling_factor * max_len / self.max_position_embeddings) -
            (self.scaling_factor - 1))**(self.rotary_dim /
                                         (self.rotary_dim - 2))
        inv_freq = self._compute_inv_freq(base)
        t = torch.arange(max_len, dtype=torch.float, device="cuda")

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache
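
Note: the cached table is simply [cos | sin] of the outer product between positions and inverse frequencies, and the linear-scaling variant only divides the positions by scaling_factor. A plain-PyTorch sketch of that cache construction (CPU only, no custom ops), with assumed small example sizes:

import torch

def cos_sin_cache(rotary_dim: int, max_positions: int, base: float,
                  scaling_factor: float = 1.0) -> torch.Tensor:
    # inv_freq[j] = 1 / base**(2j / rotary_dim)
    inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2,
                                          dtype=torch.float) / rotary_dim))
    t = torch.arange(max_positions * scaling_factor, dtype=torch.float)
    t = t / scaling_factor            # linear scaling stretches the positions
    freqs = torch.einsum("i,j -> ij", t, inv_freq)
    return torch.cat((freqs.cos(), freqs.sin()), dim=-1)

print(cos_sin_cache(rotary_dim=8, max_positions=16, base=10000.0).shape)
# torch.Size([16, 8])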
server/vllm/vllm/model_executor/layers/sampler.py (deleted, 100644 → 0)

"""A layer that samples the next tokens from the model's outputs."""
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_gather)
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput,
                           SequenceData, SequenceGroupOutputs, SequenceOutputs)

_SAMPLING_EPS = 1e-5


class Sampler(nn.Module):
    """Samples the next tokens from the model's outputs.

    This layer does the following:
    1. Discard the hidden states that are not used for sampling (i.e., all
        tokens except the final one in each prompt).
    2. Compute the logits for the next tokens.
    3. Apply presence and frequency penalties.
    4. Apply temperature scaling.
    5. Apply top-p and top-k truncation.
    6. Sample the next tokens.
    Here, each sequence group within the batch can have different sampling
    parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
    """

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.vocab_size = vocab_size

    def forward(
        self,
        embedding: torch.Tensor,
        hidden_states: torch.Tensor,
        input_metadata: InputMetadata,
        embedding_bias: Optional[torch.Tensor] = None,
    ) -> SamplerOutput:
        # Get the hidden states that we use for sampling.
        hidden_states = _prune_hidden_states(hidden_states, input_metadata)

        # Get the logits for the next tokens.
        logits = _get_logits(hidden_states, embedding, embedding_bias,
                             self.vocab_size)

        # Apply presence and frequency penalties.
        output_tokens = _get_output_tokens(input_metadata)
        assert len(output_tokens) == logits.shape[0]
        presence_penalties, frequency_penalties = _get_penalties(
            input_metadata)
        assert len(presence_penalties) == logits.shape[0]
        assert len(frequency_penalties) == logits.shape[0]
        logits = _apply_penalties(logits, output_tokens, presence_penalties,
                                  frequency_penalties)

        # Apply temperature scaling.
        temperatures = _get_temperatures(input_metadata)
        assert len(temperatures) == logits.shape[0]
        if any(t != 1.0 for t in temperatures):
            t = torch.tensor(temperatures,
                             dtype=logits.dtype,
                             device=logits.device)
            # Use in-place division to avoid creating a new tensor.
            logits.div_(t.unsqueeze(dim=1))

        # Apply top-p and top-k truncation.
        top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
        assert len(top_ps) == len(top_ks) == logits.shape[0]
        do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
        do_top_k = any(k != self.vocab_size for k in top_ks)
        if do_top_p or do_top_k:
            logits = _apply_top_p_top_k(logits, top_ps, top_ks)

        # We use float32 for probabilities and log probabilities.
        # Compute the probabilities.
        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
        # Compute the log probabilities.
        # Use log_softmax to ensure numerical stability.
        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)

        # Sample the next tokens.
        sample_results = _sample(probs, logprobs, input_metadata)
        # Get the logprobs query results.
        prompt_logprobs, sample_logprobs = _get_logprobs(
            logprobs, input_metadata, sample_results)
        return _build_sampler_output(sample_results, input_metadata,
                                     prompt_logprobs, sample_logprobs)


def _get_logits(hidden_states: torch.Tensor, embedding: torch.Tensor,
                embedding_bias: Optional[torch.Tensor],
                vocab_size: int) -> torch.Tensor:
    # Get the logits for the next tokens.
    logits = torch.matmul(hidden_states, embedding.t())
    if embedding_bias is not None:
        logits += embedding_bias
    logits = tensor_model_parallel_all_gather(logits)
    # Remove paddings in vocab (if any).
    logits = logits[:, :vocab_size]
    return logits


def _prune_hidden_states(
    hidden_states: torch.Tensor,
    input_metadata: InputMetadata,
) -> torch.Tensor:
    selected_token_indices: List[int] = []
    start_idx = 0
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        if i < input_metadata.num_prompts:
            assert len(seq_ids) == 1, "Prompt input should have only one seq."
            prompt_len = input_metadata.prompt_lens[i]
            if sampling_params.prompt_logprobs is not None:
                selected_token_indices.extend(
                    range(start_idx, start_idx + prompt_len - 1))
            selected_token_indices.append(start_idx + prompt_len - 1)
            start_idx += input_metadata.max_prompt_len
        else:
            num_seqs = len(seq_ids)
            selected_token_indices.extend(
                range(start_idx, start_idx + num_seqs))
            start_idx += num_seqs

    selected_token_indices = torch.tensor(selected_token_indices,
                                          dtype=torch.long,
                                          device=hidden_states.device)
    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
    return hidden_states.index_select(0, selected_token_indices)


def _get_penalties(
        input_metadata: InputMetadata) -> Tuple[List[float], List[float]]:
    # Collect the presence and frequency penalties.
    presence_penalties: List[float] = []
    frequency_penalties: List[float] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        p = sampling_params.presence_penalty
        f = sampling_params.frequency_penalty
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            # NOTE: We do not apply presence and frequency penalties for the
            # prompt token positions where we don't sample new tokens.
            prompt_len = input_metadata.prompt_lens[i]
            presence_penalties += [0] * (prompt_len - 1)
            frequency_penalties += [0] * (prompt_len - 1)
        presence_penalties += [p] * len(seq_ids)
        frequency_penalties += [f] * len(seq_ids)
    return presence_penalties, frequency_penalties


def _get_output_tokens(input_metadata: InputMetadata) -> List[List[int]]:
    output_tokens: List[List[int]] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            # NOTE: prompt token positions do not need output tokens to
            # compute penalties.
            prompt_len = input_metadata.prompt_lens[i]
            output_tokens.extend([] for _ in range(prompt_len - 1))
        for seq_id in seq_ids:
            seq_data = input_metadata.seq_data[seq_id]
            output_tokens.append(seq_data.output_token_ids)
    return output_tokens


def _apply_penalties(
    logits: torch.Tensor,
    output_tokens: List[List[int]],
    presence_penalties: List[float],
    frequency_penalties: List[float],
) -> torch.Tensor:
    num_seqs, vocab_size = logits.shape
    for i in range(num_seqs):
        if not output_tokens[i]:
            continue
        p = presence_penalties[i]
        f = frequency_penalties[i]
        if abs(p) < _SAMPLING_EPS and abs(f) < _SAMPLING_EPS:
            continue
        break
    else:
        # Return early if all sequences have zero penalties.
        return logits

    max_output_len = max(len(tokens) for tokens in output_tokens)
    padded_output_tokens = [
        tokens + [vocab_size] * (max_output_len - len(tokens))
        for tokens in output_tokens
    ]
    output_tokens_tensor = torch.tensor(padded_output_tokens,
                                        dtype=torch.long,
                                        device=logits.device)

    # Compute the bin counts for the output tokens.
    # vocab_size + 1 for padding.
    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
                             dtype=torch.long,
                             device=logits.device)
    bin_counts.scatter_add_(1, output_tokens_tensor,
                            torch.ones_like(output_tokens_tensor))
    bin_counts = bin_counts[:, :vocab_size]  # Remove the padding bin.

    frequency_penalties = torch.tensor(frequency_penalties,
                                       dtype=logits.dtype,
                                       device=logits.device)
    presence_penalties = torch.tensor(presence_penalties,
                                      dtype=logits.dtype,
                                      device=logits.device)

    # We follow the definition in OpenAI API.
    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
    logits -= frequency_penalties.unsqueeze(dim=1) * bin_counts
    logits -= presence_penalties.unsqueeze(dim=1) * (bin_counts > 0)
    return logits


def _get_temperatures(input_metadata: InputMetadata) -> List[float]:
    # Collect the temperatures for the logits.
    temperatures: List[float] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        temperature = sampling_params.temperature
        if temperature < _SAMPLING_EPS:
            # NOTE: Zero temperature means deterministic sampling
            # (i.e., greedy sampling or beam search).
            # Set the temperature to 1 to avoid division by zero.
            temperature = 1.0
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            prompt_len = input_metadata.prompt_lens[i]
            temperatures += [temperature] * (prompt_len - 1)
        temperatures += [temperature] * len(seq_ids)
    return temperatures


def _get_top_p_top_k(
    input_metadata: InputMetadata,
    vocab_size: int,
) -> Tuple[List[float], List[int]]:
    top_ps: List[float] = []
    top_ks: List[int] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        top_p = sampling_params.top_p
        # k should not be greater than the vocab size.
        top_k = min(sampling_params.top_k, vocab_size)
        # k=-1 means no truncation.
        top_k = vocab_size if top_k == -1 else top_k
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            prompt_len = input_metadata.prompt_lens[i]
            top_ps += [top_p] * (prompt_len - 1)
            top_ks += [top_k] * (prompt_len - 1)
        top_ps += [top_p] * len(seq_ids)
        top_ks += [top_k] * len(seq_ids)
    return top_ps, top_ks


def _apply_top_p_top_k(
    logits: torch.Tensor,
    top_ps: List[float],
    top_ks: List[int],
) -> torch.Tensor:
    p = torch.tensor(top_ps, dtype=logits.dtype, device=logits.device)
    k = torch.tensor(top_ks, dtype=torch.int, device=logits.device)
    logits_sort, logits_idx = logits.sort(dim=-1, descending=True)

    # Apply top-p.
    probs_sort = logits_sort.softmax(dim=-1)
    probs_sum = probs_sort.cumsum(dim=-1)
    top_p_mask = (probs_sum - probs_sort) > p.unsqueeze(dim=1)
    logits_sort[top_p_mask] = -float("inf")

    # Apply top-k.
    # Create a mask for the top-k elements.
    top_k_mask = torch.arange(logits_idx.shape[-1], device=logits_idx.device)
    top_k_mask = top_k_mask.expand(logits_idx.shape[0], -1)
    top_k_mask = top_k_mask >= k.unsqueeze(dim=1)
    logits_sort[top_k_mask] = -float("inf")

    # Re-sort the probabilities.
    logits = torch.gather(logits_sort,
                          dim=-1,
                          index=torch.argsort(logits_idx, dim=-1))
    return logits


def _greedy_sample(
    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
    logprobs: torch.Tensor,
) -> List[Tuple[List[int], List[int]]]:
    samples = torch.argmax(logprobs, dim=-1).cpu()
    sample_idx = 0
    results = []
    for seq_group in selected_seq_groups:
        seq_ids, _ = seq_group
        num_parent_seqs = len(seq_ids)
        assert num_parent_seqs == 1, (
            "Greedy sampling should have only one seq.")
        parent_ids = list(range(num_parent_seqs))
        next_token_ids = [samples[sample_idx].item()]
        results.append((next_token_ids, parent_ids))
        sample_idx += num_parent_seqs
    assert sample_idx == logprobs.size(0)
    return results


def _random_sample(
    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
    is_prompts: List[bool],
    probs: torch.Tensor,
) -> List[Tuple[List[int], List[int]]]:
    # Find the maximum best_of value of the prompt phase requests.
    max_best_of = 1
    for seq_group, is_prompt in zip(selected_seq_groups, is_prompts):
        if is_prompt:
            seq_ids, sampling_params = seq_group
            max_best_of = max(max_best_of, sampling_params.best_of)
    random_samples = torch.multinomial(probs,
                                       num_samples=max_best_of,
                                       replacement=True).cpu()
    sample_idx = 0
    results = []
    for seq_group, is_prompt in zip(selected_seq_groups, is_prompts):
        seq_ids, sampling_params = seq_group
        num_parent_seqs = len(seq_ids)
        if is_prompt:
            # Prompt phase.
            assert num_parent_seqs == 1, (
                "Prompt input should have only one seq.")
            parent_ids = [0] * sampling_params.best_of
            next_token_ids = random_samples[
                sample_idx, :sampling_params.best_of].tolist()
        else:
            # Generation phase.
            parent_ids = list(range(num_parent_seqs))
            next_token_ids = random_samples[sample_idx:sample_idx +
                                            num_parent_seqs, 0].tolist()
        results.append((next_token_ids, parent_ids))
        sample_idx += num_parent_seqs
    assert sample_idx == probs.size(0)
    return results


def _beam_search_sample(
    selected_seq_groups: List[Tuple[List[int], SamplingParams]],
    is_prompts: List[bool],
    seq_data: Dict[int, SequenceData],
    logprobs: torch.Tensor,
) -> List[Tuple[List[int], List[int]]]:
    # We sample 2 * beam_width candidates to make sure that with high
    # probability we can get `beam_width` candidates in addition to
    # the finished sequences for the next iteration. See
    # https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
    # for details. See also HF reference:
    # https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
    #
    # NOTE: Beam search is not vectorized, so its speed can be slower than
    # other sampling methods.
    sample_idx = 0
    results = []
    for seq_group, is_prompt in zip(selected_seq_groups, is_prompts):
        seq_ids, sampling_params = seq_group
        num_parent_seqs = len(seq_ids)
        beam_width = sampling_params.best_of
        seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs]
        if is_prompt:
            # Prompt phase.
            assert num_parent_seqs == 1, (
                "Prompt input should have only one seq.")
            parent_ids = [0] * (2 * beam_width)
            _, next_token_ids = torch.topk(seq_group_logprobs[0],
                                           2 * beam_width)
            next_token_ids = next_token_ids.tolist()
        else:
            # Generation phase.
            cumulative_logprobs = [
                seq_data[seq_id].cumulative_logprob for seq_id in seq_ids
            ]
            cumulative_logprobs = torch.tensor(
                cumulative_logprobs,
                dtype=torch.float,
                device=seq_group_logprobs.device)
            seq_group_logprobs = (seq_group_logprobs +
                                  cumulative_logprobs.unsqueeze(dim=1))
            _, topk_ids = torch.topk(seq_group_logprobs.flatten(),
                                     2 * beam_width)
            topk_ids = topk_ids.tolist()
            vocab_size = seq_group_logprobs.size(-1)
            parent_ids = [i // vocab_size for i in topk_ids]
            next_token_ids = [i % vocab_size for i in topk_ids]
        results.append((next_token_ids, parent_ids))
        sample_idx += num_parent_seqs
    assert sample_idx == logprobs.size(0)
    return results


def _sample(
    probs: torch.Tensor,
    logprobs: torch.Tensor,
    input_metadata: InputMetadata,
) -> List[Tuple[List[int], List[int]]]:
    categorized_seq_group_ids = {t: [] for t in SamplingType}
    categorized_sample_indices = {t: [] for t in SamplingType}
    start_idx = 0
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        sampling_type = sampling_params.sampling_type
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            # NOTE: prompt token positions do not need sample, skip
            prompt_len = input_metadata.prompt_lens[i]
            start_idx += prompt_len - 1
        categorized_seq_group_ids[sampling_type].append(i)
        num_seqs = len(seq_ids)
        categorized_sample_indices[sampling_type].extend(
            range(start_idx, start_idx + num_seqs))
        start_idx += num_seqs

    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
    for sampling_type in SamplingType:
        seq_group_ids = categorized_seq_group_ids[sampling_type]
        seq_groups = [input_metadata.seq_groups[i] for i in seq_group_ids]
        is_prompts = [i < input_metadata.num_prompts for i in seq_group_ids]
        sample_indices = categorized_sample_indices[sampling_type]
        num_tokens = len(sample_indices)
        if num_tokens == 0:
            continue
        if sampling_type == SamplingType.GREEDY:
            category_logprobs = logprobs[sample_indices]
            sample_results = _greedy_sample(seq_groups, category_logprobs)
        elif sampling_type == SamplingType.RANDOM:
            category_probs = probs[sample_indices]
            sample_results = _random_sample(seq_groups, is_prompts,
                                            category_probs)
        elif sampling_type == SamplingType.BEAM:
            category_logprobs = logprobs[sample_indices]
            sample_results = _beam_search_sample(seq_groups, is_prompts,
                                                 input_metadata.seq_data,
                                                 category_logprobs)
        else:
            raise ValueError(f"Unsupported sampling type: {sampling_type}")

        sample_results_dict.update(zip(seq_group_ids, sample_results))

    sample_results = [
        sample_results_dict[i]
        for i in range(len(input_metadata.seq_groups))
    ]
    return sample_results


def _get_logprobs(
    logprobs: torch.Tensor,
    input_metadata: InputMetadata,
    sample_results: List[Tuple[List[int], List[int]]],
) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[
        int, float]]]]:
    # Prepare query indices
    batched_logprobs_query_seq_indices: List[int] = []
    batched_logprobs_query_token_indices: List[int] = []
    largest_num_logprobs = 0
    sample_idx = 0
    for i, (seq_group, sample_result) in enumerate(
            zip(input_metadata.seq_groups, sample_results)):
        seq_ids, sampling_params = seq_group
        next_token_ids, parent_ids = sample_result
        num_parent_seqs = len(seq_ids)
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            largest_num_logprobs = max(largest_num_logprobs,
                                       sampling_params.prompt_logprobs)
            prompt_len = input_metadata.prompt_lens[i]
            prompt_tokens = input_metadata.seq_data[
                seq_ids[0]].prompt_token_ids
            batched_logprobs_query_seq_indices.extend(
                sample_idx + j for j in range(prompt_len - 1))
            batched_logprobs_query_token_indices.extend(
                token_id for token_id in prompt_tokens[1:])
            sample_idx += prompt_len - 1
        batched_logprobs_query_seq_indices.extend(
            [sample_idx + parent_id for parent_id in parent_ids])
        batched_logprobs_query_token_indices.extend(next_token_ids)
        if sampling_params.logprobs is not None:
            largest_num_logprobs = max(largest_num_logprobs,
                                       sampling_params.logprobs)
        sample_idx += num_parent_seqs
    assert sample_idx == logprobs.size(0)

    # Batched query for logprobs of selected token
    batched_logprobs_query_result = logprobs[[
        batched_logprobs_query_seq_indices,
        batched_logprobs_query_token_indices
    ]].cpu()

    # Batched query for logprobs of topk tokens
    if largest_num_logprobs > 0:
        top_logprobs, top_token_ids = torch.topk(logprobs,
                                                 largest_num_logprobs,
                                                 dim=-1)
        top_logprobs = top_logprobs.cpu()
        top_token_ids = top_token_ids.cpu()
    else:
        top_logprobs, top_token_ids = None, None

    # Gather results
    result_prompt_logprobs: List[Optional[PromptLogprobs]] = []
    result_sample_logprobs: List[SampleLogprobs] = []
    sample_idx = 0
    query_result_idx = 0
    for i, (seq_group, sample_result) in enumerate(
            zip(input_metadata.seq_groups, sample_results)):
        seq_ids, sampling_params = seq_group
        next_token_ids, parent_ids = sample_result

        # Prompt logprobs
        if (i < input_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            num_logprobs = sampling_params.prompt_logprobs
            prompt_len = input_metadata.prompt_lens[i]
            prompt_tokens = input_metadata.seq_data[
                seq_ids[0]].prompt_token_ids
            group_prompt_logprobs: PromptLogprobs = [None]
            for token_id in prompt_tokens[1:]:
                prompt_logprobs_dict = {
                    token_id:
                    batched_logprobs_query_result[query_result_idx].item()
                }
                if num_logprobs > 0:
                    prompt_logprobs_dict.update(
                        zip(
                            top_token_ids[sample_idx, :num_logprobs].tolist(),
                            top_logprobs[sample_idx, :num_logprobs].tolist()))
                group_prompt_logprobs.append(prompt_logprobs_dict)
                sample_idx += 1
                query_result_idx += 1
            result_prompt_logprobs.append(group_prompt_logprobs)
        else:
            result_prompt_logprobs.append(None)

        # Sample logprobs
        num_logprobs = sampling_params.logprobs
        if num_logprobs is None:
            num_logprobs = 0
        group_sample_logprobs: SampleLogprobs = []
        for next_token_id, parent_id in zip(next_token_ids, parent_ids):
            sample_logprobs_dict = {
                next_token_id:
                batched_logprobs_query_result[query_result_idx].item()
            }
            query_result_idx += 1
            if num_logprobs > 0:
                sample_logprobs_dict.update(
                    zip(
                        top_token_ids[sample_idx +
                                      parent_id, :num_logprobs].tolist(),
                        top_logprobs[sample_idx +
                                     parent_id, :num_logprobs].tolist()))
            group_sample_logprobs.append(sample_logprobs_dict)
        result_sample_logprobs.append(group_sample_logprobs)
        sample_idx += len(seq_ids)

    return result_prompt_logprobs, result_sample_logprobs


def _build_sampler_output(
    sample_results: List[Tuple[List[int], List[int]]],
    input_metadata: InputMetadata,
    prompt_logprobs: List[Optional[PromptLogprobs]],
    sample_logprobs: List[SampleLogprobs],
) -> SamplerOutput:
    sampler_output = []
    for (seq_group, sample_result, group_prompt_logprobs,
         group_sample_logprobs) in zip(input_metadata.seq_groups,
                                       sample_results, prompt_logprobs,
                                       sample_logprobs):
        seq_ids, _ = seq_group
        next_token_ids, parent_ids = sample_result
        seq_outputs = []
        for parent_id, next_token_id, logprobs in zip(parent_ids,
                                                      next_token_ids,
                                                      group_sample_logprobs):
            seq_outputs.append(
                SequenceOutputs(seq_ids[parent_id], next_token_id, logprobs))
        sampler_output.append(
            SequenceGroupOutputs(seq_outputs, group_prompt_logprobs))
    return sampler_output
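
Note: the penalty step follows the OpenAI definition referenced in _apply_penalties: each logit is reduced by frequency_penalty times the token's count, plus presence_penalty once the token has appeared at all. A tiny standalone sketch with assumed toy numbers:

import torch

# Toy example of the OpenAI-style penalty arithmetic used in _apply_penalties.
logits = torch.zeros(1, 5)                     # one sequence, vocab of 5
bin_counts = torch.tensor([[0, 2, 1, 0, 0]])   # token 1 seen twice, token 2 once
frequency_penalty, presence_penalty = 0.5, 1.0

logits -= frequency_penalty * bin_counts
logits -= presence_penalty * (bin_counts > 0)
print(logits)   # tensor([[ 0.0000, -2.0000, -1.5000,  0.0000,  0.0000]])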
server/vllm/vllm/model_executor/model_loader.py
deleted
100644 → 0
View file @
64def8e2
"""Utilities for selecting and loading models."""
import
contextlib
from
typing
import
Type
import
torch
import
torch.nn
as
nn
from
transformers
import
PretrainedConfig
from
vllm.config
import
ModelConfig
from
vllm.model_executor.models
import
*
# pylint: disable=wildcard-import
from
vllm.model_executor.weight_utils
import
(
get_quant_config
,
initialize_dummy_weights
)
# TODO(woosuk): Lazy-load the model classes.
_MODEL_REGISTRY
=
{
"AquilaModel"
:
AquilaForCausalLM
,
"AquilaForCausalLM"
:
AquilaForCausalLM
,
# AquilaChat2
"BaiChuanForCausalLM"
:
BaiChuanForCausalLM
,
# baichuan-7b
"BaichuanForCausalLM"
:
BaichuanForCausalLM
,
# baichuan-13b
"BloomForCausalLM"
:
BloomForCausalLM
,
"FalconForCausalLM"
:
FalconForCausalLM
,
"GPT2LMHeadModel"
:
GPT2LMHeadModel
,
"GPTBigCodeForCausalLM"
:
GPTBigCodeForCausalLM
,
"GPTJForCausalLM"
:
GPTJForCausalLM
,
"GPTNeoXForCausalLM"
:
GPTNeoXForCausalLM
,
"InternLMForCausalLM"
:
InternLMForCausalLM
,
"LlamaForCausalLM"
:
LlamaForCausalLM
,
"LLaMAForCausalLM"
:
LlamaForCausalLM
,
# For decapoda-research/llama-*
# "MistralForCausalLM": MistralForCausalLM,
"MPTForCausalLM"
:
MPTForCausalLM
,
"OPTForCausalLM"
:
OPTForCausalLM
,
"QWenLMHeadModel"
:
QWenLMHeadModel
,
"RWForCausalLM"
:
FalconForCausalLM
,
}
# FIXME(woosuk): Remove this once all models support quantization.
_MODEL_CLASSES_SUPPORT_QUANTIZATION
=
[
LlamaForCausalLM
,
# MistralForCausalLM,
]
@
contextlib
.
contextmanager
def
_set_default_torch_dtype
(
dtype
:
torch
.
dtype
):
"""Sets the default torch dtype to the given dtype."""
old_dtype
=
torch
.
get_default_dtype
()
torch
.
set_default_dtype
(
dtype
)
yield
torch
.
set_default_dtype
(
old_dtype
)
def
_get_model_architecture
(
config
:
PretrainedConfig
)
->
Type
[
nn
.
Module
]:
architectures
=
getattr
(
config
,
"architectures"
,
[])
for
arch
in
architectures
:
if
arch
in
_MODEL_REGISTRY
:
return
_MODEL_REGISTRY
[
arch
]
raise
ValueError
(
f
"Model architectures
{
architectures
}
are not supported for now. "
f
"Supported architectures:
{
list
(
_MODEL_REGISTRY
.
keys
())
}
"
)
def
get_model
(
model_config
:
ModelConfig
)
->
nn
.
Module
:
model_class
=
_get_model_architecture
(
model_config
.
hf_config
)
# Get the quantization config.
quant_config
=
None
if
model_config
.
quantization
is
not
None
:
if
model_class
not
in
_MODEL_CLASSES_SUPPORT_QUANTIZATION
:
raise
ValueError
(
f
"Quantization is not supported for
{
model_class
}
."
)
quant_config
=
get_quant_config
(
model_config
.
quantization
,
model_config
.
model
,
model_config
.
download_dir
)
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
if
capability
<
quant_config
.
get_min_capability
():
raise
ValueError
(
f
"The quantization method
{
model_config
.
quantization
}
is not "
"supported for the current GPU. "
f
"Minimum capability:
{
quant_config
.
get_min_capability
()
}
. "
f
"Current capability:
{
capability
}
."
)
supported_dtypes
=
quant_config
.
get_supported_act_dtypes
()
if
model_config
.
dtype
not
in
supported_dtypes
:
raise
ValueError
(
f
"
{
model_config
.
dtype
}
is not supported for quantization "
f
"method
{
model_config
.
quantization
}
. Supported dtypes: "
f
"
{
supported_dtypes
}
"
)
with
_set_default_torch_dtype
(
model_config
.
dtype
):
# Create a model instance.
# The weights will be initialized as empty tensors.
if
model_class
in
_MODEL_CLASSES_SUPPORT_QUANTIZATION
:
model
=
model_class
(
model_config
.
hf_config
,
quant_config
)
else
:
model
=
model_class
(
model_config
.
hf_config
)
if
model_config
.
load_format
==
"dummy"
:
model
=
model
.
cuda
()
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
initialize_dummy_weights
(
model
)
else
:
# Load the weights from the cached or downloaded files.
model
.
load_weights
(
model_config
.
model
,
model_config
.
download_dir
,
model_config
.
load_format
,
model_config
.
revision
)
model
=
model
.
cuda
()
return
model
.
eval
()
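A minimal usage sketch of the `_set_default_torch_dtype` helper defined above (the import refers to the module as it stood before this deletion, and the restored-default assertion assumes the process started with torch's usual float32 default):

# Usage sketch only; not part of the commit.
import torch
from vllm.model_executor.model_loader import _set_default_torch_dtype

with _set_default_torch_dtype(torch.float16):
    w = torch.empty(4, 4)                    # allocated as float16 inside the block
assert w.dtype == torch.float16
assert torch.get_default_dtype() == torch.float32   # restored on exit (assuming float32 before)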
server/vllm/vllm/model_executor/models/__init__.py  deleted 100644 → 0  View file @ 64def8e2
from vllm.model_executor.models.aquila import AquilaForCausalLM
from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
                                                 BaichuanForCausalLM)
from vllm.model_executor.models.bloom import BloomForCausalLM
from vllm.model_executor.models.falcon import FalconForCausalLM
from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
from vllm.model_executor.models.gpt_j import GPTJForCausalLM
from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
from vllm.model_executor.models.internlm import InternLMForCausalLM
from vllm.model_executor.models.llama import LlamaForCausalLM
# from vllm.model_executor.models.mistral import MistralForCausalLM
from vllm.model_executor.models.mpt import MPTForCausalLM
from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.models.qwen import QWenLMHeadModel

__all__ = [
    "AquilaForCausalLM",
    "BaiChuanForCausalLM",
    "BaichuanForCausalLM",
    "BloomForCausalLM",
    "FalconForCausalLM",
    "GPT2LMHeadModel",
    "GPTBigCodeForCausalLM",
    "GPTJForCausalLM",
    "GPTNeoXForCausalLM",
    "InternLMForCausalLM",
    "LlamaForCausalLM",
    "MPTForCausalLM",
    "OPTForCausalLM",
    "QWenLMHeadModel",
    # "MistralForCausalLM",
]
server/vllm/vllm/model_executor/models/aquila.py  deleted 100644 → 0  View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (
    hf_model_weights_iterator, load_padded_tensor_parallel_vocab,
    load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.aquila import AquilaConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


class AquilaMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        self.gate_up_proj = ColumnParallelLinear(
            hidden_size,
            2 * intermediate_size,
            bias=False,
            gather_output=False,
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class AquilaRMSNorm(nn.Module):

    def __init__(self, hidden_size, eps=1e-6):
        """
        AquilaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        variance = hidden_states.to(torch.float32).pow(2).mean(-1,
                                                               keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance +
                                                    self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)


class AquilaAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        assert self.total_num_kv_heads % tp_size == 0
        self.num_kv_heads = self.total_num_kv_heads // tp_size
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = ColumnParallelLinear(
            hidden_size,
            (self.total_num_heads + 2 * self.total_num_kv_heads) *
            self.head_dim,
            bias=False,
            gather_output=False,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_dim,
            self.scaling,
            rotary_dim=self.head_dim,
            base=self.rope_theta,
            max_position=self.max_position_embeddings,
            num_kv_heads=self.num_kv_heads,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        output, _ = self.o_proj(attn_output)
        return output


class AquilaDecoderLayer(nn.Module):

    def __init__(self, config: AquilaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = AquilaAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
        )
        self.mlp = AquilaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = AquilaRMSNorm(config.hidden_size,
                                             eps=config.rms_norm_eps)
        self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
                                                      eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class AquilaModel(nn.Module):

    def __init__(self, config: AquilaConfig):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        #vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            AquilaDecoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.norm(hidden_states)
        return hidden_states


class AquilaForCausalLM(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = AquilaModel(config)
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.lm_head = ColumnParallelLinear(
            config.hidden_size,
            vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "qkv_proj.weight", "gate_proj.weight", "up_proj.weight"
    ]
    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_size = get_tensor_model_parallel_world_size()
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        q_proj_shard_size = (self.config.hidden_size // tp_size)
        kv_proj_shard_size = (self.config.hidden_size //
                              self.config.num_attention_heads *
                              self.config.num_key_value_heads // tp_size)
        attention_weight_specs = [
            # (weight_name, shard_size, offset)
            ("q_proj", q_proj_shard_size, 0),
            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
            ("v_proj", kv_proj_shard_size,
             q_proj_shard_size + kv_proj_shard_size),
        ]
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue

            is_attention_weight = False
            for weight_name, shard_size, offset in attention_weight_specs:
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "qkv_proj")]

                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]
                param_slice = param.data[offset:offset + shard_size]
                assert param_slice.shape == loaded_weight.shape

                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
            if is_attention_weight:
                continue

            is_gate_up_weight = False
            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
                break
            if is_gate_up_weight:
                continue

            param = state_dict[name]
            if "embed_tokens" in name or "lm_head" in name:
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tensor_model_parallel_rank)
                continue

            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
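For readers following the `load_weights` logic above, here is a toy illustration (made-up shapes, plain tensors, not the real checkpoint) of how each rank's q/k/v slices are stitched into the fused `qkv_proj` buffer at the offsets listed in `attention_weight_specs`:

# Toy sketch of the qkv_proj stitching; tp_rank stands in for the tensor-parallel rank.
import torch

hidden, q_shard, kv_shard, tp_rank = 8, 4, 2, 0
qkv_proj = torch.empty(q_shard + 2 * kv_shard, hidden)   # this rank's fused buffer

full_q = torch.randn(q_shard * 2, hidden)   # pretend tp_size == 2
full_k = torch.randn(kv_shard * 2, hidden)
full_v = torch.randn(kv_shard * 2, hidden)

for shard, size, offset in [(full_q, q_shard, 0),
                            (full_k, kv_shard, q_shard),
                            (full_v, kv_shard, q_shard + kv_shard)]:
    piece = shard[size * tp_rank:size * (tp_rank + 1)]    # rows owned by this rank
    qkv_proj[offset:offset + size].copy_(piece)           # place at the q/k/v offset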
server/vllm/vllm/model_executor/models/baichuan.py  deleted 100644 → 0  View file @ 64def8e2
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only BaiChuan model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
import math
from typing import List, Optional, Tuple

import torch
from torch import nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.attention import (PagedAttentionWithRoPE,
                                                  PagedAttentionWithALiBi)
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (
    convert_pyslice_to_tensor, hf_model_weights_iterator,
    load_padded_tensor_parallel_vocab, load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
    base = torch.tensor(
        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
        dtype=torch.float32,
    )
    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != total_num_heads:
        extra_base = torch.tensor(
            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
            dtype=torch.float32,
        )
        num_remaining_heads = min(closest_power_of_2,
                                  total_num_heads - closest_power_of_2)
        extra_powers = torch.arange(start=1,
                                    end=1 + 2 * num_remaining_heads,
                                    step=2,
                                    dtype=torch.int32)
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)],
                           dim=0)
    return slopes


class BaiChuanMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        self.gate_up_proj = ColumnParallelLinear(
            hidden_size,
            2 * intermediate_size,
            bias=False,
            gather_output=False,
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class BaiChuanAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        position_embedding: str,
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
        )
        self.total_num_heads = num_heads
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)
        self.head_dim = hidden_size // self.total_num_heads
        self.postion_embedding = position_embedding
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        # pylint: disable=invalid-name
        self.W_pack = ColumnParallelLinear(
            hidden_size,
            3 * hidden_size,
            bias=False,
            gather_output=False,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        # Create the alibi slopes and slice them.
        if self.postion_embedding == "ALIBI":
            tp_rank = get_tensor_model_parallel_rank()
            head_start = tp_rank * self.num_heads
            head_end = (tp_rank + 1) * self.num_heads
            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
            alibi_slopes = alibi_slopes[head_start:head_end].tolist()

            scaling = self.head_dim**-0.5
            self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
                                                scaling, alibi_slopes)
        else:
            self.scaling = self.head_dim**-0.5
            self.attn = PagedAttentionWithRoPE(
                self.num_heads,
                self.head_dim,
                self.scaling,
                rotary_dim=self.head_dim,
                base=self.rope_theta,
                max_position=self.max_position_embeddings)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.W_pack(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        if self.postion_embedding == "ALIBI":
            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
                                    cache_event)
        else:
            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                    input_metadata, cache_event)
        output, _ = self.o_proj(attn_output)
        return output


class BaiChuanDecoderLayer(nn.Module):

    def __init__(self, config: BaiChuanConfig, position_embedding: str):
        super().__init__()
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = BaiChuanAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            position_embedding=position_embedding,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
        )
        self.mlp = BaiChuanMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class BaiChuanModel(nn.Module):

    def __init__(self, config: BaiChuanConfig, position_embedding: str):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            BaiChuanDecoderLayer(config, position_embedding)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.norm(hidden_states)
        return hidden_states


class BaiChuanBaseForCausalLM(nn.Module):

    def __init__(self, config, position_embedding: str):
        super().__init__()
        self.config = config
        self.model = BaiChuanModel(config, position_embedding)
        self.lm_head = ColumnParallelLinear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = []
    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_world_size = get_tensor_model_parallel_world_size()
        tp_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue

            loaded_weight = convert_pyslice_to_tensor(loaded_weight)

            if "W_pack" in name:
                total_num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
                num_heads = total_num_heads // tp_world_size
                head_start = tp_rank * num_heads
                head_end = (tp_rank + 1) * num_heads

                loaded_weight = loaded_weight.view(3, total_num_heads,
                                                   head_size, hidden_size)
                loaded_weight = loaded_weight[:, head_start:head_end, :, :]
                loaded_weight = loaded_weight.reshape(-1, hidden_size)

            is_gate_up_weight = False
            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
                                              (tp_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
                break
            if is_gate_up_weight:
                continue

            param = state_dict[name]
            if "embed_tokens" in name or "lm_head" in name:
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tp_rank)
                continue

            load_tensor_parallel_weights(
                param,
                loaded_weight,
                name,
                self._column_parallel_weights,
                self._row_parallel_weights,
                tp_rank,
            )


class BaichuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 13b

    def __init__(self, config):
        super().__init__(config, "ALIBI")


class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 7b

    def __init__(self, config):
        super().__init__(config, "ROPE")
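The `_get_alibi_slopes` helper above (duplicated in the BLOOM and Falcon files below) follows the standard ALiBi slope schedule: for a power-of-two head count n the slopes form a geometric sequence with ratio 2^(-8/n). A quick sanity check for 8 heads, importing from the module as it stood before this deletion:

# Sanity check only; the import refers to the pre-deletion module.
import torch
from vllm.model_executor.models.baichuan import _get_alibi_slopes

slopes = _get_alibi_slopes(8)
expected = torch.tensor([2.0**-i for i in range(1, 9)])   # 1/2, 1/4, ..., 1/256
assert torch.allclose(slopes, expected)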
server/vllm/vllm/model_executor/models/bloom.py  deleted 100644 → 0  View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
# Copyright 2023 The CacheFlow team.
# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only BLOOM model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
import math
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import BloomConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
    base = torch.tensor(
        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
        dtype=torch.float32,
    )
    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != total_num_heads:
        extra_base = torch.tensor(
            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
            dtype=torch.float32,
        )
        num_remaining_heads = min(closest_power_of_2,
                                  total_num_heads - closest_power_of_2)
        extra_powers = torch.arange(start=1,
                                    end=1 + 2 * num_remaining_heads,
                                    step=2,
                                    dtype=torch.int32)
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)],
                           dim=0)
    return slopes


class BloomAttention(nn.Module):

    def __init__(self, config: BloomConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.total_num_heads = config.n_head
        self.head_dim = self.hidden_size // self.total_num_heads
        assert self.head_dim * self.total_num_heads == self.hidden_size

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        self.query_key_value = ColumnParallelLinear(
            self.hidden_size,
            3 * self.hidden_size,
            bias=True,
            gather_output=False,
        )
        self.dense = RowParallelLinear(
            self.hidden_size,
            self.hidden_size,
            bias=True,
            input_is_parallel=True,
        )

        # Create the alibi slopes and slice them.
        tp_rank = get_tensor_model_parallel_rank()
        head_start = tp_rank * self.num_heads
        head_end = (tp_rank + 1) * self.num_heads
        alibi_slopes = _get_alibi_slopes(self.total_num_heads)
        alibi_slopes = alibi_slopes[head_start:head_end].tolist()

        scaling = self.head_dim**-0.5
        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
                                            scaling, alibi_slopes)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        del position_ids  # Unused.
        qkv, _ = self.query_key_value(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
                                cache_event)
        output, _ = self.dense(attn_output)
        return output


class BloomMLP(nn.Module):

    def __init__(self, config: BloomConfig):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense_h_to_4h = ColumnParallelLinear(
            hidden_size,
            4 * hidden_size,
            gather_output=False,
        )
        self.act = get_act_fn("gelu")
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            input_is_parallel=True,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.dense_h_to_4h(x)
        x = self.act(x)
        x, _ = self.dense_4h_to_h(x)
        return x


class BloomBlock(nn.Module):

    def __init__(self, config: BloomConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.input_layernorm = nn.LayerNorm(hidden_size,
                                            eps=config.layer_norm_epsilon)
        self.self_attention = BloomAttention(config)
        self.post_attention_layernorm = nn.LayerNorm(
            hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = BloomMLP(config)
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)

        # Layer norm post the self attention.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Self attention.
        attention_output = self.self_attention(
            position_ids=position_ids,
            hidden_states=layernorm_output,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        attention_output = attention_output + residual
        layernorm_output = self.post_attention_layernorm(attention_output)

        # Get residual
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = attention_output

        # MLP.
        output = self.mlp(layernorm_output) + residual
        return output


class BloomModel(nn.Module):

    def __init__(self, config: BloomConfig):
        super().__init__()
        self.embed_dim = config.hidden_size

        # Embedding + LN Embedding
        self.word_embeddings = VocabParallelEmbedding(
            config.vocab_size,
            self.embed_dim,
        )
        self.word_embeddings_layernorm = nn.LayerNorm(
            self.embed_dim, eps=config.layer_norm_epsilon)

        # Transformer blocks
        self.h = nn.ModuleList(
            [BloomBlock(config) for _ in range(config.num_hidden_layers)])

        # Final Layer Norm
        self.ln_f = nn.LayerNorm(self.embed_dim,
                                 eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.word_embeddings(input_ids)
        hidden_states = self.word_embeddings_layernorm(hidden_states)
        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                position_ids,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class BloomForCausalLM(nn.Module):

    def __init__(self, config: BloomConfig):
        super().__init__()
        self.config = config
        self.transformer = BloomModel(config)
        # TODO(zhuohan): create a new weight after implementing pipeline
        # parallelism
        self.lm_head_weight = self.transformer.word_embeddings.weight
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "word_embeddings.weight", "dense_h_to_4h.weight", "dense_h_to_4h.bias"
    ]
    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if name == "lm_head.weight":
                # Since hidden_states are parallelized, we need to
                # load lm_head.weight in parallel.
                self._column_parallel_weights.append(name)
                # If lm_head is provided, use it instead.
                param = self.lm_head_weight
            else:
                if not name.startswith("transformer."):
                    name = "transformer." + name
                param = state_dict[name]

            if "query_key_value" in name:
                # NOTE(woosuk): BLOOM's fused QKV has the shape of
                # [num_heads * 3 * head_size, hidden_size], while the
                # required shape is [3 * num_heads * head_size, hidden_size].
                # Thus, we need weight conversion.
                shard_size = param.shape[0]
                start = shard_size * tp_rank
                end = shard_size * (tp_rank + 1)
                loaded_weight = loaded_weight[start:end]

                num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // num_heads
                if "query_key_value.weight" in name:
                    loaded_weight = loaded_weight.view(-1, 3, head_size,
                                                       hidden_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif "query_key_value.bias" in name:
                    loaded_weight = loaded_weight.view(-1, 3, head_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1)
                else:
                    raise ValueError(f"Unexpected weight name: {name}")
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights, tp_rank)
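A toy demonstration (made-up sizes, plain tensors) of the QKV layout conversion that `load_weights` above performs for BLOOM checkpoints, where q/k/v rows are interleaved per head while the fused kernel wants all query heads first, then all keys, then all values:

# Toy sketch of the view/transpose/reshape conversion; not part of the commit.
import torch

num_heads, head_size, hidden = 2, 3, 4
w = torch.arange(num_heads * 3 * head_size * hidden, dtype=torch.float32)
w = w.view(num_heads * 3 * head_size, hidden)            # checkpoint layout

converted = (w.view(num_heads, 3, head_size, hidden)     # split per head
              .transpose(0, 1)                           # bring q/k/v outermost
              .reshape(3 * num_heads * head_size, hidden))
# converted[0:num_heads * head_size] now holds every head's query rows.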
server/vllm/vllm/model_executor/models/falcon.py  deleted 100644 → 0  View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
# Copyright 2023 The vLLM team.
# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Falcon model."""
import math
from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import LayerNorm
from transformers import FalconConfig as HF_FalconConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.attention import (PagedAttention,
                                                  PagedAttentionWithALiBi,
                                                  PagedAttentionWithRoPE)
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
                                              hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs import RWConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]
FalconConfig = Union[HF_FalconConfig, RWConfig]


# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during
# training, this means that there's one additional quantization to bfloat16
# between the operations. In order not to degrade the quality of our HF-port,
# we keep these characteristics in the final model.
class FalconLinear(nn.Linear):

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden_states = x @ self.weight.T
        if self.bias is None:
            return hidden_states
        return hidden_states + self.bias


def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
    base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
                        dtype=torch.float32)
    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != total_num_heads:
        extra_base = torch.tensor(
            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
            dtype=torch.float32)
        num_remaining_heads = min(closest_power_of_2,
                                  total_num_heads - closest_power_of_2)
        extra_powers = torch.arange(1,
                                    1 + 2 * num_remaining_heads,
                                    2,
                                    dtype=torch.int32)
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)],
                           dim=0)
    return slopes


class FalconAttention(nn.Module):

    def __init__(self, config: FalconConfig):
        super().__init__()

        self.hidden_size = config.hidden_size
        tp_size = get_tensor_model_parallel_world_size()

        self.total_num_heads = config.num_attention_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.head_dim = self.hidden_size // self.total_num_heads
        assert self.head_dim * self.total_num_heads == self.hidden_size

        self.new_decoder_architecture = config.new_decoder_architecture
        self.multi_query = config.multi_query

        if self.new_decoder_architecture:
            self.total_num_kv_heads = config.num_kv_heads
            assert self.total_num_heads % tp_size == 0
            self.num_kv_heads = self.total_num_kv_heads // tp_size
            self.query_key_value = ColumnParallelLinear(
                self.hidden_size,
                (self.total_num_heads + 2 * self.total_num_kv_heads) *
                self.head_dim,
                bias=config.bias,
                gather_output=False,
                skip_bias_add=True,
            )
        elif self.multi_query:
            self.total_num_kv_heads = 1
            self.num_kv_heads = 1
            self.query = ColumnParallelLinear(
                self.hidden_size,
                self.total_num_heads * self.head_dim,
                bias=config.bias,
                gather_output=False,
                skip_bias_add=True,
            )
            self.key_value = FalconLinear(self.hidden_size,
                                          2 * self.head_dim,
                                          bias=config.bias)
        else:
            self.total_num_kv_heads = self.total_num_heads
            self.num_kv_heads = self.num_heads
            self.query_key_value = ColumnParallelLinear(
                self.hidden_size,
                (self.total_num_heads + 2 * self.total_num_kv_heads) *
                self.head_dim,
                bias=config.bias,
                gather_output=False,
                skip_bias_add=True,
            )

        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim

        # Layer-wise attention scaling
        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
        self.reduce_row_parallel_results = not (
            config.new_decoder_architecture or config.parallel_attn)
        self.dense = RowParallelLinear(
            self.hidden_size,
            self.hidden_size,
            bias=config.bias,
            input_is_parallel=True,
            skip_bias_add=True,
            reduce_results=self.reduce_row_parallel_results)

        self.use_rotary = config.rotary
        self.use_alibi = config.alibi
        assert not (self.use_rotary and self.use_alibi), (
            "Rotary and alibi are mutually exclusive.")

        if self.use_rotary:
            rope_theta = getattr(config, "rope_theta", 10000)
            max_position_embeddings = getattr(config,
                                              "max_position_embeddings", 8192)
            self.attn = PagedAttentionWithRoPE(
                self.num_heads,
                self.head_dim,
                self.inv_norm_factor,
                base=rope_theta,
                max_position=max_position_embeddings,
                rotary_dim=self.head_dim,
                num_kv_heads=self.num_kv_heads)
        elif self.use_alibi:
            tp_rank = get_tensor_model_parallel_rank()
            head_start = tp_rank * self.num_heads
            head_end = (tp_rank + 1) * self.num_heads
            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
                            self.inv_norm_factor)
            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
            self.attn = PagedAttentionWithALiBi(self.num_heads,
                                                self.head_dim,
                                                self.inv_norm_factor,
                                                alibi_slopes,
                                                num_kv_heads=self.num_kv_heads)
        else:
            self.attn = PagedAttention(self.num_heads,
                                       self.head_dim,
                                       scale=self.inv_norm_factor,
                                       num_kv_heads=self.num_kv_heads)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        if not self.new_decoder_architecture and self.multi_query:
            q, bias = self.query(hidden_states)
            if bias is not None:
                q += bias
            kv = self.key_value(hidden_states)
            k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
        else:
            qkv, bias = self.query_key_value(hidden_states)
            if bias is not None:
                qkv += bias
            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
                                dim=-1)
        k_cache, v_cache = kv_cache
        if self.use_rotary:
            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                    input_metadata, cache_event)
        else:
            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
                                    cache_event)
        attn_output, bias = self.dense(attn_output)
        return attn_output, bias


class FalconMLP(nn.Module):

    def __init__(self, config: FalconConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
                                                  4 * hidden_size,
                                                  bias=config.bias,
                                                  gather_output=False,
                                                  skip_bias_add=True)
        self.act = nn.GELU()
        self.reduce_row_parallel_results = not (
            config.new_decoder_architecture or config.parallel_attn)
        self.dense_4h_to_h = RowParallelLinear(
            4 * hidden_size,
            hidden_size,
            bias=config.bias,
            input_is_parallel=True,
            skip_bias_add=True,
            reduce_results=self.reduce_row_parallel_results)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # NOTE(zhuohan): Following huggingface, we do not fuse bias add here.
        x, bias = self.dense_h_to_4h(x)
        if bias is not None:
            x += bias
        x = self.act(x)
        x, bias = self.dense_4h_to_h(x)
        return x, bias


class FalconDecoderLayer(nn.Module):

    def __init__(self, config: FalconConfig):
        super().__init__()
        hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.self_attention = FalconAttention(config)
        self.mlp = FalconMLP(config)
        self.config = config

        if config.new_decoder_architecture:
            # The layer norm before self-attention
            self.ln_attn = LayerNorm(hidden_size,
                                     eps=config.layer_norm_epsilon)
            # The layer norm before the MLP
            self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        else:
            self.input_layernorm = LayerNorm(hidden_size,
                                             eps=config.layer_norm_epsilon)
            if not config.parallel_attn:
                self.post_attention_layernorm = LayerNorm(
                    hidden_size, eps=config.layer_norm_epsilon)

        self.reduce_row_parallel_results = not (
            config.new_decoder_architecture or config.parallel_attn)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ):
        residual = hidden_states

        if self.config.new_decoder_architecture:
            attention_layernorm_out = self.ln_attn(hidden_states)
            mlp_layernorm_out = self.ln_mlp(hidden_states)
        else:
            attention_layernorm_out = self.input_layernorm(hidden_states)

        # Self attention.
        attention_output, attention_bias = self.self_attention(
            positions=positions,
            hidden_states=attention_layernorm_out,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        if self.reduce_row_parallel_results and attention_bias is not None:
            attention_output += attention_bias

        if not self.config.new_decoder_architecture:
            if self.config.parallel_attn:
                mlp_layernorm_out = attention_layernorm_out
            else:
                residual += attention_output
                mlp_layernorm_out = self.post_attention_layernorm(residual)

        # MLP.
        mlp_output, mlp_bias = self.mlp(mlp_layernorm_out)
        if self.reduce_row_parallel_results and mlp_bias is not None:
            mlp_output += mlp_bias

        if not self.reduce_row_parallel_results:
            # When MLP and Attention layers are parallel, we can use
            # only one all-reduce operator to reduce the results from
            # both MLP and Attention layers.
            mlp_output += attention_output
            mlp_output = tensor_model_parallel_all_reduce(mlp_output)
            if attention_bias is not None:
                mlp_output += attention_bias
            if mlp_bias is not None:
                mlp_output += mlp_bias

        output = mlp_output + residual
        return output


class FalconModel(nn.Module):

    def __init__(self, config: FalconConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.use_alibi = config.alibi

        # Embedding + LN Embedding
        self.word_embeddings = VocabParallelEmbedding(
            config.vocab_size,
            self.embed_dim,
        )

        # Transformer blocks
        self.h = nn.ModuleList([
            FalconDecoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ])

        # Final Layer Norm
        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.word_embeddings(input_ids)
        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class FalconForCausalLM(nn.Module):

    def __init__(self, config: FalconConfig):
        super().__init__()
        self.config = config
        self.transformer = FalconModel(config)
        self.lm_head = ColumnParallelLinear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(
            input_ids,
            positions,
            kv_caches,
            input_metadata,
            cache_events,
        )
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "word_embeddings.weight", "lm_head.weight", "dense_h_to_4h.weight",
        "dense_h_to_4h.bias"
    ]
    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_size = (get_tensor_model_parallel_world_size())
        tp_rank = get_tensor_model_parallel_rank()

        hidden_size = self.config.hidden_size
        total_num_heads = self.config.num_attention_heads
        num_heads = total_num_heads // tp_size
        head_size = hidden_size // total_num_heads
        head_start = tp_rank * num_heads
        head_end = (tp_rank + 1) * num_heads
        if self.config.new_decoder_architecture:
            total_num_kv_heads = self.config.num_kv_heads
            num_kv_heads = total_num_kv_heads // tp_size
            separated_q_kv = False
            kv_head_start = tp_rank * num_kv_heads
            kv_head_end = (tp_rank + 1) * num_kv_heads
        elif self.config.multi_query:
            total_num_kv_heads = 1
            num_kv_heads = 1
            separated_q_kv = True
            kv_head_start = 0
            kv_head_end = 1
        else:
            total_num_kv_heads = total_num_heads
            num_kv_heads = total_num_kv_heads // tp_size
            separated_q_kv = False
            kv_head_start = tp_rank * num_kv_heads
            kv_head_end = (tp_rank + 1) * num_kv_heads
        num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads

        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "query_key_value" in name:
                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                loaded_weight_size = loaded_weight.size()
                loaded_weight = loaded_weight.view(
                    total_num_kv_heads, num_query_heads_per_kv_head + 2,
                    head_size, *loaded_weight_size[1:])

                wq = loaded_weight[:, :-2].reshape(-1, *loaded_weight_size[1:])
                wk = loaded_weight[:, [-2]].reshape(-1,
                                                    *loaded_weight_size[1:])
                wv = loaded_weight[:, [-1]].reshape(-1,
                                                    *loaded_weight_size[1:])

                wq = wq[head_size * head_start:head_size * head_end]
                wk = wk[head_size * kv_head_start:head_size * kv_head_end]
                wv = wv[head_size * kv_head_start:head_size * kv_head_end]

                if separated_q_kv:
                    loaded_weight_q = wq
                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
                    q_weight_name = name.replace("query_key_value", "query")
                    kv_weight_name = name.replace("query_key_value",
                                                  "key_value")
                    load_tensor_parallel_weights(state_dict[q_weight_name],
                                                 loaded_weight_q,
                                                 q_weight_name,
                                                 self._column_parallel_weights,
                                                 self._row_parallel_weights,
                                                 tp_rank)
                    load_tensor_parallel_weights(state_dict[kv_weight_name],
                                                 loaded_weight_kv,
                                                 kv_weight_name,
                                                 self._column_parallel_weights,
                                                 self._row_parallel_weights,
                                                 tp_rank)
                    continue
                else:
                    loaded_weight = torch.cat([wq, wk, wv], dim=0)

            param = state_dict[name]
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights, tp_rank)
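A toy illustration (made-up sizes) of the grouped view used in `load_weights` above to peel q, k and v out of Falcon's fused `query_key_value` weight: each KV head owns a group of `num_query_heads_per_kv_head` query heads followed by one key head and one value head, so indexing `[:, :-2]`, `[:, [-2]]` and `[:, [-1]]` recovers wq, wk and wv.

# Toy sketch of the Falcon fused-QKV split; not part of the commit.
import torch

total_kv_heads, q_per_kv, head_size, hidden = 2, 3, 4, 5
rows = total_kv_heads * (q_per_kv + 2) * head_size
w = torch.randn(rows, hidden)

g = w.view(total_kv_heads, q_per_kv + 2, head_size, hidden)
wq = g[:, :-2].reshape(-1, hidden)      # all query heads
wk = g[:, [-2]].reshape(-1, hidden)     # one key head per group
wv = g[:, [-1]].reshape(-1, hidden)     # one value head per group
assert wq.shape[0] == total_kv_heads * q_per_kv * head_size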
server/vllm/vllm/model_executor/models/gpt2.py  deleted 100644 → 0  View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-2 model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import GPT2Config

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
                                              hf_model_weights_iterator,
                                              load_padded_tensor_parallel_vocab,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class GPT2Attention(nn.Module):

    def __init__(self, config: GPT2Config):
        super().__init__()
        self.hidden_size = config.hidden_size
        total_num_heads = config.num_attention_heads
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        assert total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = total_num_heads // tensor_model_parallel_world_size
        self.head_dim = self.hidden_size // total_num_heads
        self.scale = self.head_dim**-0.5

        self.c_attn = ColumnParallelLinear(
            self.hidden_size,
            3 * self.hidden_size,
            bias=True,
            gather_output=False,
        )
        self.c_proj = RowParallelLinear(
            self.hidden_size,
            self.hidden_size,
            bias=True,
            input_is_parallel=True,
        )
        self.attn = PagedAttention(self.num_heads,
                                   self.head_dim,
                                   scale=self.scale)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.c_attn(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        key_cache, value_cache = kv_cache
        attn_output = self.attn(q, k, v, key_cache, value_cache,
                                input_metadata, cache_event)
        attn_output, _ = self.c_proj(attn_output)
        return attn_output


class GPT2MLP(nn.Module):

    def __init__(
        self,
        intermediate_size: int,
        config: GPT2Config,
    ):
        super().__init__()
        hidden_size = config.hidden_size
        self.c_fc = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            bias=True,
            gather_output=False,
        )
        self.c_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=True,
            input_is_parallel=True,
        )
        self.act = get_act_fn(config.activation_function)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states, _ = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states, _ = self.c_proj(hidden_states)
        return hidden_states


class GPT2Block(nn.Module):

    def __init__(self, config: GPT2Config):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = (config.n_inner
                     if config.n_inner is not None else 4 * hidden_size)

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPT2Attention(config)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = GPT2MLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_output = self.attn(
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states
        return hidden_states


class GPT2Model(nn.Module):

    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config
        assert not config.add_cross_attention
        assert not config.scale_attn_by_inverse_layer_idx
        assert not config.reorder_and_upcast_attn
        self.embed_dim = config.hidden_size

        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
        # to 50304 in order to make it divisible by 64.
        # This improves performance since GPUs are faster if the dimension
        # is divisible by 64. In addition, it allows us to shard the embedding
        # layer across 2, 4, 8, or more GPUs.
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        self.h = nn.ModuleList(
            [GPT2Block(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)

        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class GPT2LMHeadModel(nn.Module):

    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config
        self.transformer = GPT2Model(config)
        # TODO(zhuohan): create a new weight after implementing pipeline
        # parallelism
        self.lm_head_weight = self.transformer.wte.weight
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = ["c_fc.weight", "c_fc.bias"]
    _row_parallel_weights = ["c_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "lm_head.weight" in name:
                # GPT-2 ties the weights of the embedding layer and the final
                # linear layer.
                continue
            if ".attn.bias" in name or ".attn.masked_bias" in name:
                # Skip attention mask.
                # NOTE: "c_attn.bias" should not be skipped.
                continue
            if not name.startswith("transformer."):
                name = "transformer." + name

            loaded_weight = convert_pyslice_to_tensor(loaded_weight)

            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
            # Because of this, we need to transpose the weights.
            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
                if conv1d_weight_name not in name:
                    continue
                if not name.endswith(".weight"):
                    continue
                loaded_weight = loaded_weight.t()
            param = state_dict[name]

            if name == "transformer.wte.weight":
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tensor_model_parallel_rank)
                continue

            # For the fused QKV linear layer, manually shard the weights.
            if "c_attn" in name:
                # GPT-2's fused QKV has the shape of
                # [3 * num_heads * head_size, hidden_size].
                # When tensor parallelism is used, we shard the weights along
                # the head dimension.
                total_num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
                num_heads = total_num_heads // tensor_model_parallel_world_size
                head_start = tensor_model_parallel_rank * num_heads
                head_end = (tensor_model_parallel_rank + 1) * num_heads

                if name.endswith(".weight"):
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size, hidden_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif name.endswith(".bias"):
                    loaded_weight = loaded_weight.view(3, total_num_heads,
                                                       head_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :]
                    loaded_weight = loaded_weight.reshape(-1)
                else:
                    raise ValueError(f"Unexpected parameter name {name}")
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
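A standalone sketch, not part of the deleted gpt2.py above: it replays the c_attn head-sharding arithmetic from load_weights with plain PyTorch. The sizes are made-up toy values chosen only to check the shape bookkeeping, so each tensor-parallel rank ends up with a contiguous block of its own query, key and value heads.

import torch

# Toy dimensions (assumptions, not taken from any real checkpoint).
total_num_heads, head_size, hidden_size = 12, 64, 768
tp_size, tp_rank = 4, 1
fused_qkv = torch.randn(3 * total_num_heads * head_size, hidden_size)

num_heads = total_num_heads // tp_size
head_start = tp_rank * num_heads
head_end = (tp_rank + 1) * num_heads

# View as [3, total_num_heads, head_size, hidden_size], keep this rank's
# heads, then flatten back to the fused [3 * num_heads * head_size, hidden].
shard = fused_qkv.view(3, total_num_heads, head_size, hidden_size)
shard = shard[:, head_start:head_end, :, :].reshape(-1, hidden_size)

assert shard.shape == (3 * num_heads * head_size, hidden_size)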
server/vllm/vllm/model_executor/models/gpt_bigcode.py deleted 100644 → 0 View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2023 CTranslate2, and Michael Feil
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPTBigCode model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import GPTBigCodeConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
                                              hf_model_weights_iterator,
                                              load_padded_tensor_parallel_vocab,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class GPTBigCodeAttention(nn.Module):

    def __init__(self, config: GPTBigCodeConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        total_num_heads = config.num_attention_heads
        self.tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        assert total_num_heads % self.tensor_model_parallel_world_size == 0
        self.num_heads = (total_num_heads //
                          self.tensor_model_parallel_world_size)
        self.head_dim = self.hidden_size // total_num_heads
        self.scale = self.head_dim**-0.5

        self.multi_query = config.multi_query
        if self.multi_query:
            self.num_kv_heads = 1
            self.kv_dim = self.head_dim
            self.c_attn_q = ColumnParallelLinear(
                self.hidden_size,
                self.hidden_size,
                bias=True,
                gather_output=False,
            )
            self.c_attn_kv = nn.Linear(self.hidden_size,
                                       2 * self.kv_dim,
                                       bias=True)
        else:
            self.num_kv_heads = self.num_heads
            self.kv_dim = self.num_kv_heads * self.head_dim
            self.c_attn = ColumnParallelLinear(
                self.hidden_size,
                self.hidden_size + 2 * self.kv_dim,
                bias=True,
                gather_output=False,
            )

        self.c_proj = RowParallelLinear(
            self.hidden_size,
            self.hidden_size,
            bias=True,
            input_is_parallel=True,
        )
        self.attn = PagedAttention(self.num_heads,
                                   self.head_dim,
                                   scale=self.scale,
                                   num_kv_heads=self.num_kv_heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        if self.multi_query:
            q, _ = self.c_attn_q(hidden_states)
            kv = self.c_attn_kv(hidden_states)
            k, v = kv.split([self.kv_dim, self.kv_dim], dim=-1)
        else:
            qkv, _ = self.c_attn(hidden_states)
            q, k, v = qkv.split([
                self.hidden_size // self.tensor_model_parallel_world_size,
                self.kv_dim, self.kv_dim
            ],
                                dim=-1)
        key_cache, value_cache = kv_cache
        attn_output = self.attn(q, k, v, key_cache, value_cache,
                                input_metadata, cache_event)
        attn_output, _ = self.c_proj(attn_output)
        return attn_output


class GPTBigMLP(nn.Module):

    def __init__(
        self,
        intermediate_size: int,
        config: GPTBigCodeConfig,
    ):
        super().__init__()
        hidden_size = config.hidden_size
        self.c_fc = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            bias=True,
            gather_output=False,
        )
        self.c_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=True,
            input_is_parallel=True,
        )
        self.act = get_act_fn(config.activation_function)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states, _ = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states, _ = self.c_proj(hidden_states)
        return hidden_states


class GPTBigCodeBlock(nn.Module):

    def __init__(self, config: GPTBigCodeConfig):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = (config.n_inner
                     if config.n_inner is not None else 4 * hidden_size)

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPTBigCodeAttention(config)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = GPTBigMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_output = self.attn(
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        # residual connection
        hidden_states = attn_output + residual

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states
        return hidden_states


class GPTBigCodeModel(nn.Module):

    def __init__(self, config: GPTBigCodeConfig):
        super().__init__()
        self.config = config
        assert not config.add_cross_attention
        self.embed_dim = config.hidden_size

        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
        # to 50304 in order to make it divisible by 64.
        # This improves performance since GPUs are faster if the dimension
        # is divisible by 64. In addition, it allows us to shard the embedding
        # layer across 2, 4, 8, or more GPUs.
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
        self.h = nn.ModuleList(
            [GPTBigCodeBlock(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)

        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class GPTBigCodeForCausalLM(nn.Module):

    def __init__(self, config: GPTBigCodeConfig):
        super().__init__()
        self.config = config
        self.transformer = GPTBigCodeModel(config)
        # TODO(zhuohan): create a new weight after implementing pipeline
        # parallelism
        self.lm_head_weight = self.transformer.wte.weight
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = ["c_fc.weight", "c_fc.bias"]
    _row_parallel_weights = ["c_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "lm_head.weight" in name:
                # GPT-2 ties the weights of the embedding layer and the final
                # linear layer.
                continue
            if ".attn.bias" in name:
                # Skip attention mask.
                # NOTE: "c_attn.bias" should not be skipped.
                continue
            if not name.startswith("transformer."):
                name = "transformer." + name

            # For the fused QKV linear layer, manually shard the weights.
            if "c_attn" in name:
                # GPT-2's fused QKV has the shape of
                # [3 * num_heads * head_size, hidden_size].
                # When tensor parallelism is used, we shard the weights along
                # the head dimension.
                total_num_heads = self.config.num_attention_heads
                total_num_kv_heads = (1 if self.config.multi_query else
                                      total_num_heads)
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
                total_kv_size = head_size * total_num_kv_heads
                num_heads = total_num_heads // tensor_model_parallel_world_size
                head_start = tensor_model_parallel_rank * num_heads
                head_end = (tensor_model_parallel_rank + 1) * num_heads

                loaded_weight = convert_pyslice_to_tensor(loaded_weight)
                wq, wk, wv = torch.split(
                    loaded_weight, [hidden_size, total_kv_size, total_kv_size],
                    dim=0)

                wq = wq[head_size * head_start:head_size * head_end]
                if not self.config.multi_query:
                    # Split the heads when using normal multi-head attention
                    wk = wk[head_size * head_start:head_size * head_end]
                    wv = wv[head_size * head_start:head_size * head_end]
                    loaded_weight = torch.cat([wq, wk, wv], dim=0)
                else:
                    # For multi-query attention, we split the query
                    # but replicate the key and value.
                    loaded_weight_q = wq
                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
                    q_weight_name = name.replace("c_attn", "c_attn_q")
                    kv_weight_name = name.replace("c_attn", "c_attn_kv")
                    load_tensor_parallel_weights(state_dict[q_weight_name],
                                                 loaded_weight_q,
                                                 q_weight_name,
                                                 self._column_parallel_weights,
                                                 self._row_parallel_weights,
                                                 tensor_model_parallel_rank)
                    load_tensor_parallel_weights(state_dict[kv_weight_name],
                                                 loaded_weight_kv,
                                                 kv_weight_name,
                                                 self._column_parallel_weights,
                                                 self._row_parallel_weights,
                                                 tensor_model_parallel_rank)
                    continue

            param = state_dict[name]
            if name == "transformer.wte.weight":
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tensor_model_parallel_rank)
                continue

            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
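A standalone sketch, not part of the deleted gpt_bigcode.py above: for multi-query attention the fused c_attn weight is split into a query block of hidden_size rows plus one key and one value block of head_size rows each. Only the query block is sharded across tensor-parallel ranks; the key/value block is replicated. The toy sizes below are assumptions made up for illustration.

import torch

total_num_heads, hidden_size = 16, 1024
head_size = hidden_size // total_num_heads            # 64
tp_size, tp_rank = 2, 0
num_heads = total_num_heads // tp_size                # heads kept by this rank
head_start, head_end = tp_rank * num_heads, (tp_rank + 1) * num_heads

# Fused multi-query weight: [hidden_size (q) + head_size (k) + head_size (v), hidden_size].
fused = torch.randn(hidden_size + 2 * head_size, hidden_size)
wq, wk, wv = torch.split(fused, [hidden_size, head_size, head_size], dim=0)

wq_shard = wq[head_size * head_start:head_size * head_end]   # sharded query
kv_replicated = torch.cat([wk, wv], dim=0)                   # replicated k/v

assert wq_shard.shape == (num_heads * head_size, hidden_size)
assert kv_replicated.shape == (2 * head_size, hidden_size)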
server/vllm/vllm/model_executor/models/gpt_j.py deleted 100644 → 0 View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
# Copyright 2023 The vLLM team.
# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-J model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import GPTJConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class GPTJAttention(nn.Module):

    def __init__(self, config: GPTJConfig):
        super().__init__()
        self.total_num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.total_num_heads

        self.qkv_proj = ColumnParallelLinear(
            config.hidden_size,
            3 * config.hidden_size,
            bias=False,
            gather_output=False,
        )
        self.out_proj = RowParallelLinear(
            config.hidden_size,
            config.hidden_size,
            bias=False,
            input_is_parallel=True,
        )

        tp_world_size = get_tensor_model_parallel_world_size()
        assert self.total_num_heads % tp_world_size == 0
        self.num_heads = self.total_num_heads // tp_world_size

        scaling = self.head_size**-0.5
        assert getattr(config, "rotary", True)
        assert config.rotary_dim % 2 == 0
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_size,
            scaling,
            config.rotary_dim,
            base=rope_theta,
            max_position=max_position_embeddings,
            is_neox_style=False)
        self.warmup = False

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        attn_output, _ = self.out_proj(attn_output)
        return attn_output


class GPTJMLP(nn.Module):

    def __init__(self, intermediate_size: int, config: GPTJConfig):
        super().__init__()
        hidden_size = config.n_embd
        self.fc_in = ColumnParallelLinear(
            hidden_size,
            intermediate_size,
            gather_output=False,
        )
        self.fc_out = RowParallelLinear(
            intermediate_size,
            hidden_size,
            input_is_parallel=True,
        )
        self.act = get_act_fn(config.activation_function)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states, _ = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states, _ = self.fc_out(hidden_states)
        return hidden_states


class GPTJBlock(nn.Module):

    def __init__(self, config: GPTJConfig):
        super().__init__()
        if config.n_inner is None:
            inner_dim = 4 * config.n_embd
        else:
            inner_dim = config.n_inner
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = GPTJAttention(config)
        self.mlp = GPTJMLP(inner_dim, config)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_output = self.attn(
            position_ids=position_ids,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        mlp_output = self.mlp(hidden_states)
        hidden_states = attn_output + mlp_output + residual
        return hidden_states


class GPTJModel(nn.Module):

    def __init__(self, config: GPTJConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.n_embd
        self.wte = VocabParallelEmbedding(
            config.vocab_size,
            self.embed_dim,
        )
        self.h = nn.ModuleList(
            [GPTJBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)
        for i in range(len(self.h)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                position_ids,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.ln_f(hidden_states)
        return hidden_states


class GPTJForCausalLM(nn.Module):

    def __init__(self, config: GPTJConfig):
        super().__init__()
        self.config = config
        assert not config.tie_word_embeddings
        self.transformer = GPTJModel(config)
        self.lm_head = ColumnParallelLinear(
            config.n_embd,
            config.vocab_size,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata, self.lm_head.bias)
        return next_tokens

    _column_parallel_weights = [
        "wte.weight", "fc_in.weight", "fc_in.bias", "lm_head.weight",
        "lm_head.bias"
    ]
    _row_parallel_weights = ["out_proj.weight", "fc_out.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tp_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "attn.bias" in name or "attn.masked_bias" in name:
                continue

            is_attention_weight = False
            for stride_id, att_weight_name in enumerate(
                ["q_proj", "k_proj", "v_proj"]):
                if att_weight_name not in name:
                    continue
                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
                shard_size = param.shape[1]
                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
                                              (tp_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
            if is_attention_weight:
                continue

            param = state_dict[name]
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights, tp_rank)
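A standalone sketch, not part of the deleted gpt_j.py above: GPT-J checkpoints store q_proj, k_proj and v_proj separately, while the model above fuses them into a single qkv_proj parameter. The loop in load_weights copies each projection into its stride of the fused tensor; the toy below reproduces that copy for a single rank (tensor parallel size 1), with made-up sizes and a simplified shard_size taken directly from the hidden size.

import torch

hidden_size = 256
qkv_proj = torch.empty(3 * hidden_size, hidden_size)   # fused parameter
checkpoint = {
    "q_proj": torch.randn(hidden_size, hidden_size),
    "k_proj": torch.randn(hidden_size, hidden_size),
    "v_proj": torch.randn(hidden_size, hidden_size),
}

for stride_id, weight_name in enumerate(["q_proj", "k_proj", "v_proj"]):
    shard_size = hidden_size
    # Copy this projection into its [stride_id * shard_size, ...) slice.
    param_slice = qkv_proj[shard_size * stride_id:shard_size * (stride_id + 1)]
    param_slice.copy_(checkpoint[weight_name])

assert torch.equal(qkv_proj[:hidden_size], checkpoint["q_proj"])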
server/vllm/vllm/model_executor/models/gpt_neox.py deleted 100644 → 0 View file @ 64def8e2
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-NeoX model compatible with HuggingFace weights.

The input of the model is flattened to a 1D tensor of tokens. The model uses
InputMetadata to extract the original 2D shape of the input.
"""
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import GPTNeoXConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                              load_tensor_parallel_weights)
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (VocabParallelEmbedding,
                                                       ColumnParallelLinear,
                                                       RowParallelLinear)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class GPTNeoXAttention(nn.Module):

    def __init__(self, config: GPTNeoXConfig):
        super().__init__()
        self.total_num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.total_num_heads

        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)

        self.query_key_value = ColumnParallelLinear(
            config.hidden_size,
            3 * config.hidden_size,
            gather_output=False,
        )
        self.dense = RowParallelLinear(
            config.hidden_size,
            config.hidden_size,
            input_is_parallel=True,
        )

        scaling = self.head_size**-0.5
        rotary_dim = int(self.head_size * config.rotary_pct)
        assert rotary_dim % 2 == 0
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_size,
            scaling,
            rotary_dim,
            base=rope_theta,
            max_position=max_position_embeddings)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        output, _ = self.dense(attn_output)
        return output


class GPTNeoXMLP(nn.Module):

    def __init__(self, config: GPTNeoXConfig):
        super().__init__()
        self.dense_h_to_4h = ColumnParallelLinear(
            config.hidden_size,
            config.intermediate_size,
            gather_output=False,
        )
        self.dense_4h_to_h = RowParallelLinear(
            config.intermediate_size,
            config.hidden_size,
            input_is_parallel=True,
        )
        self.act = get_act_fn(config.hidden_act)

    def forward(self, hidden_states):
        hidden_states, _ = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states, _ = self.dense_4h_to_h(hidden_states)
        return hidden_states


class GPTNeoXLayer(nn.Module):

    def __init__(self, config: GPTNeoXConfig):
        super().__init__()
        self.use_parallel_residual = config.use_parallel_residual
        self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                            eps=config.layer_norm_eps)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
                                                     eps=config.layer_norm_eps)
        self.attention = GPTNeoXAttention(config)
        self.mlp = GPTNeoXMLP(config)

    def forward(
        self,
        position_ids: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        attn_input = self.input_layernorm(hidden_states)
        attn_output = self.attention(
            position_ids=position_ids,
            hidden_states=attn_input,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )

        if self.use_parallel_residual:
            # pseudocode:
            # x = x + attn(ln1(x)) + mlp(ln2(x))
            mlp_input = self.post_attention_layernorm(hidden_states)
            mlp_output = self.mlp(mlp_input)
            hidden_states = mlp_output + attn_output + hidden_states
        else:
            # pseudocode:
            # x = x + attn(ln1(x))
            # x = x + mlp(ln2(x))
            attn_output = attn_output + hidden_states
            mlp_input = self.post_attention_layernorm(attn_output)
            mlp_output = self.mlp(mlp_input)
            hidden_states = mlp_output + attn_output
        return hidden_states


class GPTNeoXModel(nn.Module):

    def __init__(self, config: GPTNeoXConfig):
        super().__init__()
        self.config = config

        self.embed_in = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList(
            [GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
        self.final_layer_norm = nn.LayerNorm(config.hidden_size,
                                             eps=config.layer_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_in(input_ids)
        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                position_ids,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states


class GPTNeoXForCausalLM(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gpt_neox = GPTNeoXModel(config)
        self.embed_out = ColumnParallelLinear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.gpt_neox(input_ids, positions, kv_caches,
                                      input_metadata, cache_events)
        next_tokens = self.sampler(self.embed_out.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "embed_in.weight", "embed_out.weight", "dense_h_to_4h.weight",
        "dense_h_to_4h.bias"
    ]
    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if ("attention.bias" in name or "attention.masked_bias" in name
                    or "rotary_emb.inv_freq" in name):
                continue
            param = state_dict[name]

            if "query_key_value" in name:
                # NOTE(woosuk): GPT-NeoX's fused QKV has the shape of
                # [num_heads * 3 * head_size, hidden_size], while the
                # required shape is [3 * num_heads * head_size, hidden_size].
                # Thus, we need weight conversion.
                shard_size = param.shape[0]
                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]

                num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // num_heads
                if "query_key_value.weight" in name:
                    loaded_weight = loaded_weight.view(-1, 3, head_size,
                                                       hidden_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif "query_key_value.bias" in name:
                    loaded_weight = loaded_weight.view(-1, 3, head_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1)
                else:
                    raise ValueError(f"Unexpected weight name: {name}")
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
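A standalone sketch, not part of the deleted gpt_neox.py above: GPT-NeoX checkpoints store the fused QKV weight interleaved per head as [num_heads * 3 * head_size, hidden_size], while the attention layer above expects [3 * num_heads * head_size, hidden_size]. The view/transpose/reshape below mirrors that conversion on made-up toy sizes.

import torch

num_heads, head_size, hidden_size = 4, 8, 32
interleaved = torch.randn(num_heads * 3 * head_size, hidden_size)

converted = interleaved.view(num_heads, 3, head_size, hidden_size)
converted = converted.transpose(0, 1)            # -> [3, num_heads, head_size, hidden]
converted = converted.reshape(-1, hidden_size)   # -> [3 * num_heads * head_size, hidden]

# Head 0's query rows sit first in both layouts, so they should match exactly.
assert torch.equal(converted[:head_size], interleaved[:head_size])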
server/vllm/vllm/model_executor/models/internlm.py deleted 100644 → 0 View file @ 64def8e2
# -*- coding: utf-8 -*-
from typing import List, Optional, Tuple

import torch
from torch import nn
from transformers import LlamaConfig

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear,
                                                       RowParallelLinear,
                                                       VocabParallelEmbedding)
from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
                                              load_padded_tensor_parallel_vocab,
                                              load_tensor_parallel_weights)
from vllm.sequence import SamplerOutput

KVCache = Tuple[torch.Tensor, torch.Tensor]


class InternLMMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
    ):
        super().__init__()
        self.gate_up_proj = ColumnParallelLinear(
            hidden_size,
            2 * intermediate_size,
            bias=False,
            gather_output=False,
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            input_is_parallel=True,
        )
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class InternLMAttention(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tensor_model_parallel_world_size = (
            get_tensor_model_parallel_world_size())
        self.total_num_heads = num_heads
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)
        self.head_dim = hidden_size // self.total_num_heads
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = ColumnParallelLinear(
            hidden_size,
            3 * self.total_num_heads * self.head_dim,
            bias=True,
            gather_output=False,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=True,
            input_is_parallel=True,
        )
        self.attn = PagedAttentionWithRoPE(
            self.num_heads,
            self.head_dim,
            self.scaling,
            base=self.rope_theta,
            max_position=self.max_position_embeddings,
            rotary_dim=self.head_dim)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
                                input_metadata, cache_event)
        output, _ = self.o_proj(attn_output)
        return output


class InternLMDecoderLayer(nn.Module):

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = InternLMAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
        )
        self.mlp = InternLMMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        # Self Attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class InternLMModel(nn.Module):

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.embed_tokens = VocabParallelEmbedding(
            vocab_size,
            config.hidden_size,
        )
        self.layers = nn.ModuleList([
            InternLMDecoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
            if cache_events is None:
                cache_event = None
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
            )
        hidden_states = self.norm(hidden_states)
        return hidden_states


class InternLMForCausalLM(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = InternLMModel(config)
        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.lm_head = ColumnParallelLinear(
            config.hidden_size,
            vocab_size,
            bias=False,
            gather_output=False,
        )
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> SamplerOutput:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    _column_parallel_weights = [
        "qkv_proj.weight", "gate_proj.weight", "up_proj.weight"
    ]
    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     load_format: str = "auto",
                     revision: Optional[str] = None):
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue

            if "embed_tokens" in name or "lm_head" in name:
                param = state_dict[name]
                load_padded_tensor_parallel_vocab(param, loaded_weight,
                                                  tensor_model_parallel_rank)
                continue

            is_attention_weight = False
            for stride_id, att_weight_name in enumerate(
                ["q_proj", "k_proj", "v_proj"]):
                if att_weight_name not in name:
                    continue
                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
                shard_size = param.shape[0] // 3
                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
            if is_attention_weight:
                continue

            is_gate_up_weight = False
            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
                if weight_name not in name:
                    continue
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[
                    shard_size * tensor_model_parallel_rank:shard_size *
                    (tensor_model_parallel_rank + 1)]
                param_slice = param.data[shard_size * stride_id:shard_size *
                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
                break
            if is_gate_up_weight:
                continue

            param = state_dict[name]
            load_tensor_parallel_weights(param, loaded_weight, name,
                                         self._column_parallel_weights,
                                         self._row_parallel_weights,
                                         tensor_model_parallel_rank)
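A standalone sketch, not part of the deleted internlm.py above: the MLP fuses gate_proj and up_proj into a single gate_up_proj, and the SiluAndMul-style activation is assumed to split that fused output in half and compute silu(gate) * up. The toy below, using only plain PyTorch and made-up sizes, checks that the fused layout gives the same result as separate projections.

import torch
import torch.nn.functional as F

hidden_size, intermediate_size = 16, 64
x = torch.randn(2, hidden_size)
w_gate = torch.randn(intermediate_size, hidden_size)
w_up = torch.randn(intermediate_size, hidden_size)

# Fused weight stacked as [gate; up], matching the gate_up_proj layout.
w_gate_up = torch.cat([w_gate, w_up], dim=0)
gate_up = x @ w_gate_up.t()
gate, up = gate_up.chunk(2, dim=-1)
fused_out = F.silu(gate) * up

# Same result computed with the unfused projections.
unfused_out = F.silu(x @ w_gate.t()) * (x @ w_up.t())
assert torch.allclose(fused_out, unfused_out, atol=1e-5)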