Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b0b9d2d9
Commit
b0b9d2d9
authored
Aug 01, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.5.0-dtk24.04.1' into v0.5.3.post1-dtk24.04.1
parents
c9305344
ffbef65c
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
101 additions
and
47 deletions
+101
-47
README.md
README.md
+4
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+23
-0
csrc/attention/static_switch.h
csrc/attention/static_switch.h
+18
-28
examples/offline_inference.py
examples/offline_inference.py
+1
-1
setup.py
setup.py
+1
-1
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+7
-3
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+13
-3
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+7
-3
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+13
-3
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+14
-4
No files found.
README.md
View file @
b0b9d2d9
...
...
@@ -54,7 +54,10 @@ pip install setuptools wheel
```shell
git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git
# 根据需要的分支进行切换
```
安装依赖:
```shell
pip install -r requirements-rocm.txt
```
-
提供2种源码编译方式(进入vllm目录):
```
1. 编译whl包并安装
...
...
benchmarks/benchmark_throughput.py
View file @
b0b9d2d9
...
...
@@ -5,12 +5,14 @@ import random
import
time
from
typing
import
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
from
tqdm
import
tqdm
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedTokenizerBase
)
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.inputs
import
PromptInputs
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -123,6 +125,23 @@ def run_vllm(
max_tokens
=
output_len
,
))
# warmup
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
batch_size
,
args
.
input_len
))
dummy_inputs
:
List
[
PromptInputs
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
def
run_to_completion
():
llm
.
generate
(
dummy_inputs
,
sampling_params
=
sampling_params
,
use_tqdm
=
False
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
run_to_completion
()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
...
...
@@ -299,6 +318,10 @@ if __name__ == "__main__":
default
=
1
,
help
=
"Number of generated sequences per prompt."
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
'--num-iters-warmup'
,
type
=
int
,
default
=
1
,
help
=
'Number of iterations to run for warmup.'
)
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
1000
,
...
...
csrc/attention/static_switch.h
View file @
b0b9d2d9
...
...
@@ -31,38 +31,28 @@
} \
}()
// #define HEADSIZE_SWITCH(HEADDIM, ...) \
// [&] { \
// if (HEADDIM == 64) { \
// constexpr static int HEAD_SIZE = 64; \
// return __VA_ARGS__(); \
// } else if (HEADDIM == 80) { \
// constexpr static int HEAD_SIZE = 80; \
// return __VA_ARGS__(); \
// } else if (HEADDIM == 96) { \
// constexpr static int HEAD_SIZE = 96; \
// return __VA_ARGS__(); \
// } else if (HEADDIM == 112) { \
// constexpr static int HEAD_SIZE = 112; \
// return __VA_ARGS__(); \
// } else if (HEADDIM == 128) { \
// constexpr static int HEAD_SIZE = 128; \
// return __VA_ARGS__(); \
// } else if (HEADDIM == 256) { \
// constexpr static int HEAD_SIZE = 256; \
// return __VA_ARGS__(); \
// } \
// else { \
// TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
// } \
// }()
#define HEADSIZE_SWITCH(HEADDIM, ...) \
[&] { \
if (HEADDIM == 128) { \
if (HEADDIM == 64) { \
constexpr static int HEAD_SIZE = 64; \
return __VA_ARGS__(); \
} else if (HEADDIM == 80) { \
constexpr static int HEAD_SIZE = 80; \
return __VA_ARGS__(); \
} else if (HEADDIM == 96) { \
constexpr static int HEAD_SIZE = 96; \
return __VA_ARGS__(); \
} else if (HEADDIM == 112) { \
constexpr static int HEAD_SIZE = 112; \
return __VA_ARGS__(); \
} else if (HEADDIM == 128) { \
constexpr static int HEAD_SIZE = 128; \
return __VA_ARGS__(); \
} else { \
} else if (HEADDIM == 256) { \
constexpr static int HEAD_SIZE = 256; \
return __VA_ARGS__(); \
} \
else { \
TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
} \
}()
...
...
examples/offline_inference.py
View file @
b0b9d2d9
...
...
@@ -12,7 +12,7 @@ if __name__ == '__main__':
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
enforce_eager
=
False
)
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
enforce_eager
=
True
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
setup.py
View file @
b0b9d2d9
...
...
@@ -377,7 +377,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
if
sha
!=
'Unknown'
:
if
sha
is
None
:
sha
=
get_sha
(
vllm_root
)
version
=
'das1.1.git'
+
sha
[:
7
]
version
=
'das1.2.git'
+
sha
[:
7
]
# abi version
version
+=
"."
+
get_abi
()
...
...
vllm/model_executor/models/baichuan.py
View file @
b0b9d2d9
...
...
@@ -186,7 +186,7 @@ class BaiChuanAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
W_pack
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
chunk
(
chunks
=
3
,
dim
=-
1
)
if
self
.
postion_embedding
!=
"ALIBI"
:
...
...
@@ -423,14 +423,18 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
]
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"self_attn.W_pack.weight"
]
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
if
self
.
use_gemm_pad
and
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
weight
.
data
.
shape
[
0
]
==
12288
:
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_words
,
layername
)):
if
not
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
...
...
vllm/model_executor/models/chatglm.py
View file @
b0b9d2d9
...
...
@@ -108,7 +108,7 @@ class GLMAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
query_key_value
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
position_ids
,
q
,
k
)
...
...
@@ -421,14 +421,24 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
]
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"self_attention.query_key_value.weight"
]
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
lay_qkv_bias_words
=
[
"self_attention.query_key_value.bias"
]
qkv_bias_words
=
"|"
.
join
(
lay_qkv_bias_words
)
for
layername
,
weight
in
params_dict
.
items
():
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_bias_words
,
layername
)):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
if
self
.
use_gemm_pad
and
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
weight
.
data
.
shape
[
0
]
==
12288
:
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_words
,
layername
)):
if
not
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
...
...
vllm/model_executor/models/llama.py
View file @
b0b9d2d9
...
...
@@ -175,7 +175,7 @@ class LlamaAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
qkv_proj
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
...
...
@@ -531,14 +531,18 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
]
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"self_attn.qkv_proj.weight"
]
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
if
self
.
use_gemm_pad
and
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
weight
.
data
.
shape
[
0
]
==
12288
:
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_words
,
layername
)):
if
not
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
...
...
vllm/model_executor/models/qwen.py
View file @
b0b9d2d9
...
...
@@ -124,7 +124,7 @@ class QWenAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
c_attn
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
chunk
(
chunks
=
3
,
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
...
...
@@ -326,14 +326,24 @@ class QWenLMHeadModel(nn.Module):
]
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"attn.c_attn.weight"
]
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
lay_qkv_bias_words
=
[
"attn.c_attn.bias"
]
qkv_bias_words
=
"|"
.
join
(
lay_qkv_bias_words
)
for
layername
,
weight
in
params_dict
.
items
():
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_bias_words
,
layername
)):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
if
self
.
use_gemm_pad
and
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
weight
.
data
.
shape
[
0
]
==
12288
:
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_words
,
layername
)):
if
not
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
...
...
vllm/model_executor/models/qwen2.py
View file @
b0b9d2d9
...
...
@@ -156,7 +156,7 @@ class Qwen2Attention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
qkv_proj
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
...
...
@@ -411,14 +411,24 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
]
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"self_attn.qkv_proj.weight"
]
qkv_words
=
"|"
.
join
(
lay_qkv_words
)
lay_qkv_bias_words
=
[
"self_attn.qkv_proj.bias"
]
qkv_bias_words
=
"|"
.
join
(
lay_qkv_bias_words
)
for
layername
,
weight
in
params_dict
.
items
():
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_bias_words
,
layername
)):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
if
self
.
use_gemm_pad
and
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
weight
.
data
.
shape
[
0
]
==
12288
:
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
if
self
.
use_fa_pad
and
(
re
.
findall
(
qkv_words
,
layername
)):
if
not
gemm_bank_conf
(
weight
.
data
.
shape
[
0
]):
weight
.
data
=
pad_weight
(
weight
.
data
,
32
)
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment