Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
35a8304d
Commit
35a8304d
authored
Oct 09, 2024
by
zhuwenwen
Browse files
添加w8a8 rocblas非融合支持
parent
5a9c236d
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
97 additions
and
57 deletions
+97
-57
README.md
README.md
+12
-21
examples/offline_inference.py
examples/offline_inference.py
+19
-20
requirements-rocm.txt
requirements-rocm.txt
+1
-1
vllm/_custom_ops.py
vllm/_custom_ops.py
+10
-9
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+1
-1
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+2
-2
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+18
-1
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+17
-1
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+17
-1
No files found.
README.md
View file @
35a8304d
...
...
@@ -9,25 +9,16 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
## 支持模型结构列表
| 结构 | 模型 | 模型并行 | FP16 |
| :----------: | :----------: | :------: | :--: |
| LlamaForCausalLM | LLaMA | Yes | Yes |
| LlamaForCausalLM | LLaMA-2 | Yes | Yes |
| LlamaForCausalLM | LLaMA-3 | Yes | Yes |
| LlamaForCausalLM | Codellama | Yes | Yes |
| :------: | :------: | :------: | :------: |
| LlamaForCausalLM | LLaMA、LLaMA-2、LLaMA-3、Codellama、deepseek、Yi | Yes | Yes |
| QWenLMHeadModel | QWen | Yes | Yes |
| Qwen2ForCausalLM | QWen1.5 | Yes | Yes |
| Qwen2ForCausalLM | CodeQwen1.5 | Yes | Yes |
| Qwen2ForCausalLM | QWen2 | Yes | Yes |
| ChatGLMModel | chatglm2 | Yes | Yes |
| ChatGLMModel | chatglm3 | Yes | Yes |
| BaiChuanForCausalLM | Baichuan | Yes | Yes |
| BaiChuanForCausalLM | Baichuan2 | Yes | Yes |
| BloomForCausalLM | BLOOM | Yes | Yes |
| Qwen2ForCausalLM | QWen1.5、CodeQwen1.5、QWen2 | Yes | Yes |
| ChatGLMModel | chatglm2、chatglm3 | Yes | Yes |
| BaiChuanForCausalLM | Baichuan、Baichuan2 | Yes | Yes |
| BloomForCausalLM | BLOOM | Yes | Yes |
| InternLMForCausalLM | InternLM | Yes | Yes |
| InternLM2ForCausalLM | InternLM2 | Yes | Yes |
| LlamaForCausalLM | deepseek | Yes | Yes |
| DeepseekV2ForCausalLM | DeepSeek-V2 | Yes | Yes |
| LlamaForCausalLM | Yi | Yes | Yes |
| MixtralForCausalLM | Mixtral-8x7B | Yes | Yes |
...
...
examples/offline_inference.py
View file @
35a8304d
from
vllm
import
LLM
,
SamplingParams
if
__name__
==
'__main__'
:
# Sample prompts.
prompts
=
[
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
16
)
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
16
)
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
,
distributed_executor_backend
=
"ray"
,
dtype
=
"float16"
,
trust_remote_code
=
True
,
enforce_eager
=
True
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
,
distributed_executor_backend
=
"ray"
,
dtype
=
"float16"
,
trust_remote_code
=
True
,
enforce_eager
=
True
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
requirements-rocm.txt
View file @
35a8304d
...
...
@@ -15,4 +15,4 @@ torch == 2.3.0
triton == 2.1.0
flash_attn == 2.6.1
xformers == 0.0.25
lmslim == 0.1.1
\ No newline at end of file
lmslim == 0.1.2
\ No newline at end of file
vllm/_custom_ops.py
View file @
35a8304d
...
...
@@ -615,18 +615,19 @@ def cutlass_scaled_mm(a: torch.Tensor,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
assert
(
b
.
shape
[
0
]
%
16
==
0
and
b
.
shape
[
1
]
%
16
==
0
)
assert
(
out_dtype
is
torch
.
bfloat16
or
out_dtype
is
torch
.
float16
)
assert
bias
is
None
or
bias
.
shape
[
0
]
==
b
.
shape
[
1
]
and
bias
.
dtype
==
out_dtype
#
assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
#
assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
#
assert bias is None or bias.shape[0] == b.shape[
#
1] and bias.dtype == out_dtype
m
=
a
.
shape
[
0
]
n
=
b
.
shape
[
1
]
out
=
torch
.
empty
((
m
,
n
),
dtype
=
out_dtype
,
device
=
a
.
device
)
#
m = a.shape[0]
#
n = b.shape[1]
#
out = torch.empty((m, n), dtype=out_dtype, device=a.device)
torch
.
ops
.
_C
.
cutlass_scaled_mm
(
out
,
a
,
b
,
scale_a
,
scale_b
,
bias
)
#
torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
return
out
# return out
return
quant_ops
.
rocblas_scaled_mm_nn
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
def
cutlass_scaled_mm_azp
(
a
:
torch
.
Tensor
,
...
...
vllm/model_executor/model_loader/loader.py
View file @
35a8304d
...
...
@@ -407,7 +407,7 @@ class DefaultModelLoader(BaseModelLoader):
for
_
,
module
in
model
.
named_modules
():
quant_method
=
getattr
(
module
,
"quant_method"
,
None
)
if
quant_method
is
not
None
and
quant_method
!=
"awq"
and
quant_method
!=
"gptq"
:
if
quant_method
is
not
None
and
quant_method
!=
"awq"
and
quant_method
!=
"gptq"
and
quant_method
!=
"compressed_tensors"
:
# When quant methods need to process weights after loading
# (for repacking, quantizing, etc), they expect parameters
# to be on the global target device. This scope is for the
...
...
vllm/model_executor/model_loader/utils.py
View file @
35a8304d
...
...
@@ -28,8 +28,8 @@ def get_model_architecture(
os
.
environ
[
'LLAMA_NN'
]
=
'1'
if
os
.
getenv
(
'GEMM_PAD'
)
!=
'1'
:
os
.
environ
[
'GEMM_PAD'
]
=
'0'
if
os
.
getenv
(
'FA_PAD'
)
!=
'
0
'
:
os
.
environ
[
'FA_PAD'
]
=
'
1
'
if
os
.
getenv
(
'FA_PAD'
)
!=
'
1
'
:
os
.
environ
[
'FA_PAD'
]
=
'
0
'
else
:
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'GEMM_PAD'
]
=
'0'
...
...
vllm/model_executor/models/llama.py
View file @
35a8304d
...
...
@@ -639,6 +639,23 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
quant_method
==
"compressed_tensors"
:
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
,
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
weight_data
=
params_dict
[
layername
]
k
=
weight_data
.
shape
[
0
]
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
k
,
-
1
)
weight_data
.
data
.
copy_
(
_weight
)
# If this function is called, it should always initialize KV cache scale
# factors (or else raise an exception). Thus, handled exceptions should
# make sure to leave KV cache scale factors in a known good (dummy) state
...
...
vllm/model_executor/models/qwen.py
View file @
35a8304d
...
...
@@ -1091,3 +1091,19 @@ class QWenLMHeadModel(nn.Module, SupportsMultiModal):
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
quant_method
==
"compressed_tensors"
:
lay_key_words
=
[
"attn.c_attn.weight"
,
"attn.c_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.c_proj.weight"
,
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
weight_data
=
params_dict
[
layername
]
k
=
weight_data
.
shape
[
0
]
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
k
,
-
1
)
weight_data
.
data
.
copy_
(
_weight
)
vllm/model_executor/models/qwen2.py
View file @
35a8304d
...
...
@@ -543,3 +543,19 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
quant_method
==
"compressed_tensors"
:
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
,
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
weight_data
=
params_dict
[
layername
]
k
=
weight_data
.
shape
[
0
]
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
k
,
-
1
)
weight_data
.
data
.
copy_
(
_weight
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment