Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2daf23ab
Unverified
Commit
2daf23ab
authored
Mar 07, 2024
by
Woosuk Kwon
Committed by
GitHub
Mar 07, 2024
Browse files
Separate attention backends (#3005)
parent
cbf4c05b
Changes
35
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
55 additions
and
55 deletions
+55
-55
vllm/model_executor/models/gpt_j.py
vllm/model_executor/models/gpt_j.py
+2
-2
vllm/model_executor/models/gpt_neox.py
vllm/model_executor/models/gpt_neox.py
+2
-2
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+5
-5
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+6
-6
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+2
-2
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mixtral_quant.py
+2
-2
vllm/model_executor/models/mpt.py
vllm/model_executor/models/mpt.py
+6
-6
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+4
-4
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+4
-4
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+5
-5
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+2
-2
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-2
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+6
-6
vllm/model_executor/models/stablelm.py
vllm/model_executor/models/stablelm.py
+5
-5
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-2
No files found.
vllm/model_executor/models/gpt_j.py
View file @
2daf23ab
...
...
@@ -24,7 +24,7 @@ from transformers import GPTJConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
QKVParallelLinear
,
...
...
@@ -86,7 +86,7 @@ class GPTJAttention(nn.Module):
base
=
rope_theta
,
is_neox_style
=
False
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
def
forward
(
self
,
...
...
vllm/model_executor/models/gpt_neox.py
View file @
2daf23ab
...
...
@@ -24,7 +24,7 @@ from transformers import GPTNeoXConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
QKVParallelLinear
,
...
...
@@ -87,7 +87,7 @@ class GPTNeoXAttention(nn.Module):
max_position
=
max_position_embeddings
,
base
=
rope_theta
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
def
forward
(
self
,
...
...
vllm/model_executor/models/internlm2.py
View file @
2daf23ab
...
...
@@ -7,7 +7,7 @@ from transformers import PretrainedConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
...
...
@@ -114,10 +114,10 @@ class InternLM2Attention(nn.Module):
base
=
rope_theta
,
rope_scaling
=
rope_scaling
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
)
def
forward
(
self
,
...
...
vllm/model_executor/models/llama.py
View file @
2daf23ab
...
...
@@ -30,7 +30,7 @@ from transformers import LlamaConfig
from
vllm.config
import
LoRAConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
...
...
@@ -139,11 +139,11 @@ class LlamaAttention(nn.Module):
base
=
rope_theta
,
rope_scaling
=
rope_scaling
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
sliding_window
=
sliding_window
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
sliding_window
=
sliding_window
)
def
forward
(
self
,
...
...
vllm/model_executor/models/mixtral.py
View file @
2daf23ab
...
...
@@ -29,7 +29,7 @@ from transformers import MixtralConfig
from
vllm.config
import
LoRAConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.fused_moe
import
fused_moe
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
...
...
@@ -197,7 +197,7 @@ class MixtralAttention(nn.Module):
base
=
int
(
self
.
rope_theta
),
is_neox_style
=
True
,
)
self
.
attn
=
Paged
Attention
(
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
...
...
vllm/model_executor/models/mixtral_quant.py
View file @
2daf23ab
...
...
@@ -32,7 +32,7 @@ from torch import nn
from
transformers
import
MixtralConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
ReplicatedLinear
,
...
...
@@ -214,7 +214,7 @@ class MixtralAttention(nn.Module):
base
=
int
(
self
.
rope_theta
),
is_neox_style
=
True
,
)
self
.
attn
=
Paged
Attention
(
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
...
...
vllm/model_executor/models/mpt.py
View file @
2daf23ab
...
...
@@ -8,7 +8,7 @@ import torch.nn as nn
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
QKVParallelLinear
,
...
...
@@ -105,11 +105,11 @@ class MPTAttention(nn.Module):
self
.
head_dim
=
self
.
d_model
//
self
.
total_num_heads
scaling
=
self
.
head_dim
**-
0.5
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scaling
,
alibi_slopes
=
alibi_slopes
,
num_kv_heads
=
self
.
num_kv_heads
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scaling
,
alibi_slopes
=
alibi_slopes
,
num_kv_heads
=
self
.
num_kv_heads
)
def
forward
(
self
,
...
...
vllm/model_executor/models/olmo.py
View file @
2daf23ab
...
...
@@ -43,7 +43,7 @@ import torch.nn.functional as F
from
torch
import
nn
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
...
...
@@ -126,9 +126,9 @@ class OlmoAttention(nn.Module):
base
=
rope_theta
,
)
self
.
scaling
=
self
.
head_dim
**-
0.5
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scale
=
self
.
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scale
=
self
.
scaling
)
# Attention output projection.
self
.
attn_out
=
RowParallelLinear
(
...
...
vllm/model_executor/models/opt.py
View file @
2daf23ab
...
...
@@ -25,7 +25,7 @@ from transformers import OPTConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
QKVParallelLinear
,
...
...
@@ -89,9 +89,9 @@ class OPTAttention(nn.Module):
bias
=
bias
,
linear_method
=
linear_method
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scale
=
self
.
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
scale
=
self
.
scaling
)
def
forward
(
self
,
...
...
vllm/model_executor/models/orion.py
View file @
2daf23ab
...
...
@@ -12,7 +12,7 @@ from transformers import PretrainedConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
QKVParallelLinear
,
...
...
@@ -118,10 +118,10 @@ class OrionAttention(nn.Module):
base
=
rope_theta
,
rope_scaling
=
rope_scaling
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
)
def
forward
(
self
,
...
...
vllm/model_executor/models/phi.py
View file @
2daf23ab
...
...
@@ -43,7 +43,7 @@ from transformers import PretrainedConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
LinearMethodBase
,
QKVParallelLinear
,
...
...
@@ -108,7 +108,7 @@ class PhiAttention(nn.Module):
max_position
=
max_position_embeddings
,
base
=
rope_theta
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_size
,
scaling
)
def
forward
(
self
,
...
...
vllm/model_executor/models/qwen.py
View file @
2daf23ab
...
...
@@ -12,7 +12,7 @@ from transformers import PretrainedConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
...
...
@@ -104,7 +104,7 @@ class QWenAttention(nn.Module):
base
=
rope_theta
,
rope_scaling
=
rope_scaling
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
)
def
forward
(
self
,
...
...
vllm/model_executor/models/qwen2.py
View file @
2daf23ab
...
...
@@ -30,7 +30,7 @@ from transformers import Qwen2Config
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
...
...
@@ -135,11 +135,11 @@ class Qwen2Attention(nn.Module):
max_position
=
max_position
,
base
=
self
.
rope_theta
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
sliding_window
=
self
.
sliding_window
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
sliding_window
=
self
.
sliding_window
)
def
forward
(
self
,
...
...
vllm/model_executor/models/stablelm.py
View file @
2daf23ab
...
...
@@ -25,7 +25,7 @@ from transformers import PretrainedConfig
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.linear
import
(
LinearMethodBase
,
MergedColumnParallelLinear
,
QKVParallelLinear
,
...
...
@@ -122,10 +122,10 @@ class StablelmAttention(nn.Module):
max_position
=
self
.
config
.
max_position_embeddings
,
base
=
self
.
config
.
rope_theta
,
)
self
.
attn
=
Paged
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_key_value_heads
)
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
num_kv_heads
=
self
.
num_key_value_heads
)
def
forward
(
self
,
...
...
vllm/model_executor/models/starcoder2.py
View file @
2daf23ab
...
...
@@ -25,7 +25,7 @@ from torch import nn
from
vllm.model_executor.input_metadata
import
InputMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.layers.attention
import
Paged
Attention
from
vllm.model_executor.layers.attention
import
Attention
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -103,7 +103,7 @@ class Starcoder2Attention(nn.Module):
base
=
int
(
self
.
rope_theta
),
is_neox_style
=
True
,
)
self
.
attn
=
Paged
Attention
(
self
.
attn
=
Attention
(
self
.
num_heads
,
self
.
head_dim
,
self
.
scaling
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment