OpenDAS / ColossalAI / Commits

Commit fb103cfd (Unverified)
Authored Nov 20, 2023 by Xu Kai, committed via GitHub on Nov 20, 2023

[inference] update examples and engine (#5073)

* update examples and engine
* fix choices
* update example
Parent: 0c7d8beb

Showing 11 changed files with 107 additions and 187 deletions (+107 / -187):

  colossalai/inference/__init__.py                  +2   -2
  colossalai/inference/engine/__init__.py           +2   -2
  colossalai/inference/engine/engine.py             +4   -30
  examples/inference/build_smoothquant_weight.py    +4   -5
  examples/inference/hybrid_gptq_llama.py           +0   -72
  examples/inference/hybrid_smoothquant_llama.py    +0   -69
  examples/inference/run_llama_inference.py         +89  -0
  requirements/requirements-infer.txt               +0   -1
  tests/test_infer/test_hybrid_bloom.py             +2   -2
  tests/test_infer/test_hybrid_chatglm2.py          +2   -2
  tests/test_infer/test_hybrid_llama.py             +2   -2
colossalai/inference/__init__.py

-from .engine import CaiInferEngine
+from .engine import InferenceEngine
 from .engine.policies import BloomModelInferPolicy, ChatGLM2InferPolicy, LlamaModelInferPolicy

-__all__ = ["CaiInferEngine", "LlamaModelInferPolicy", "BloomModelInferPolicy", "ChatGLM2InferPolicy"]
+__all__ = ["InferenceEngine", "LlamaModelInferPolicy", "BloomModelInferPolicy", "ChatGLM2InferPolicy"]
colossalai/inference/engine/__init__.py

-from .engine import CaiInferEngine
+from .engine import InferenceEngine

-__all__ = ["CaiInferEngine"]
+__all__ = ["InferenceEngine"]
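The only change in these two files is the engine rename, so downstream code just updates its import. A minimal sketch of the change, assuming nothing else in the package API moved in this commit:

```python
# Before this commit (old name):
# from colossalai.inference import CaiInferEngine

# After this commit (new name):
from colossalai.inference import InferenceEngine

print(InferenceEngine.__name__)  # -> "InferenceEngine"
```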
colossalai/inference/engine/engine.py

@@ -3,7 +3,6 @@ from typing import Union
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.utils import logging

 from colossalai.cluster import ProcessGroupMesh

@@ -27,9 +26,9 @@ _supported_models = [
 ]


-class CaiInferEngine:
+class InferenceEngine:
     """
-    CaiInferEngine is a class that handles the pipeline parallel inference.
+    InferenceEngine is a class that handles the pipeline parallel inference.

     Args:
         tp_size (int): the size of tensor parallelism.

@@ -42,27 +41,6 @@ class CaiInferEngine:
         max_input_len (int): the maximum input length.
         max_output_len (int): the maximum output length.

-    Example:
-
-    ```python
-    from colossalai.inference import InferEngine
-    from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
-    import colossalai
-    from transformers import LlamaForCausalLM, LlamaTokenizer
-
-    colossalai.launch_from_torch(config={})
-
-    model = LlamaForCausalLM.from_pretrained("your_path_to_model")
-    tokenizer = LlamaTokenizer.from_pretrained("/home/lczyh/share/models/llama-7b-hf")
-    # assume the model is infered with 2 pipeline stages
-    inferengine = CaiInferEngine(pp_size=2, model=model, model_policy=LlamaModelInferPolicy())
-
-    input = ["Introduce a landmark in China ","Introduce a landmark in China "]
-    data = tokenizer(input, return_tensors='pt')
-    output = inferengine.inference([data.to('cuda').data])
-    ```
-
     """

     def __init__(

@@ -148,7 +126,7 @@ class CaiInferEngine:
         if quant == "gptq":
             self.gptq_manager.post_init_gptq_buffer(self.model)

-    def generate(self, input_list: Union[BatchEncoding, dict]):
+    def generate(self, input_list: Union[list, dict]):
         """
         Args:
             input_list (list): a list of input data, each element is a `BatchEncoding` or `dict`.

@@ -157,11 +135,7 @@ class CaiInferEngine:
             out (list): a list of output data, each element is a list of token.
             timestamp (float): the time cost of the inference, only return when verbose is `True`.
         """
-        assert isinstance(
-            input_list, (BatchEncoding, dict)
-        ), f"Only accept BatchEncoding or dict as input, but get {input_list.__class__.__name__}."
-        if isinstance(input_list, BatchEncoding):
-            input_list = input_list.data
         out, timestamp = self.schedule.generate_step(self.model, iter([input_list]))
         if self.verbose:
             return out, timestamp
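With the BatchEncoding handling removed from generate(), callers unwrap tokenizer output themselves before calling the engine. A minimal sketch of the updated call pattern, assuming a distributed context has already been set up with colossalai.launch_from_torch and that `model` is a Llama checkpoint loaded as in the deleted docstring example (paths and the 2-stage pipeline are illustrative):

```python
# Sketch only: assumes colossalai.launch_from_torch(config={}) has run and that
# `model` was loaded via LlamaForCausalLM.from_pretrained("your_path_to_model").
from transformers import LlamaTokenizer

from colossalai.inference import InferenceEngine, LlamaModelInferPolicy

engine = InferenceEngine(pp_size=2, model=model, model_policy=LlamaModelInferPolicy())

tokenizer = LlamaTokenizer.from_pretrained("your_path_to_model")
batch = tokenizer(["Introduce a landmark in China"], return_tensors="pt")

# generate() now accepts a plain dict (or list) of tensors, not a BatchEncoding,
# so pass the underlying `.data` dict explicitly.
output = engine.generate(batch.to("cuda").data)
```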
examples/inference/smoothquant_llama.py → examples/inference/build_smoothquant_weight.py (renamed)

@@ -29,7 +29,7 @@ def parse_args():
         type=str,
         help="location of the calibration dataset",
     )
-    parser.add_argument("--num-samples", type=int, default=512)
+    parser.add_argument("--num-samples", type=int, default=10)
     parser.add_argument("--seq-len", type=int, default=512)
     args = parser.parse_args()
     return args

@@ -41,13 +41,12 @@ def main():
     model_path = args.model_name
     dataset_path = args.dataset_path
     output_path = args.output_path
-    num_samples = 10
-    seq_len = 512
+    num_samples = args.num_samples
+    seq_len = args.seq_len

     model, tokenizer = build_model_and_tokenizer(model_path)
     if not os.path.exists(dataset_path):
-        print(f"Cannot find the dataset at {args.dataset_path}")
-        raise FileNotFoundError
+        raise FileNotFoundError(f"Cannot find the dataset at {args.dataset_path}")
     dataset = load_dataset("json", data_files=dataset_path, split="train")

     model.quantized(tokenizer, dataset, num_samples=num_samples, seq_len=seq_len)
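With this fix, the calibration sample count and sequence length come from the command line instead of being hard-coded in main(). A minimal, self-contained sketch of the corrected flow (the parsed values shown here are illustrative; the rest of the script, including its build_model_and_tokenizer helper, is assumed unchanged):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num-samples", type=int, default=10)
parser.add_argument("--seq-len", type=int, default=512)
args = parser.parse_args(["--num-samples", "64", "--seq-len", "256"])

# These values are now forwarded to model.quantized(...) instead of the
# previously hard-coded num_samples=10 / seq_len=512.
num_samples = args.num_samples
seq_len = args.seq_len
print(num_samples, seq_len)  # 64 256
```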
examples/inference/hybrid_gptq_llama.py (deleted, 100644 → 0)

import argparse
import os

import torch
import torch.distributed as dist
from auto_gptq import AutoGPTQForCausalLM

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def run_llama_inference(args):
    quantized_model_dir = args.quantized_path
    max_batch_size = args.max_batch_size
    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    micro_batch_size = args.micro_batch_size
    # load quantized model to the first GPU
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir, inject_fused_attention=False, device=torch.cuda.current_device()
    )

    engine = CaiInferEngine(
        tp_size=2,
        pp_size=2,
        model=model,
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
        quant="gptq",
    )

    def data_gen():
        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    inputs = data_gen()
    for k, v in inputs.items():
        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
            new_shape = [1] * v.dim()
            new_shape[0] = 16
            inputs[k] = v.to("cuda").repeat(*new_shape)

    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"


def run_gptq_infernece(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    run_llama_inference(args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
    args = parser.parse_args()

    spawn(run_gptq_infernece, args.tp_size * args.pp_size, args=args)
examples/inference/hybrid_smoothquant_llama.py (deleted, 100644 → 0)

import argparse

import torch
import torch.distributed as dist

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn


@torch.no_grad()
def run_llama_inference(args):
    quantized_model_dir = args.quantized_path
    max_batch_size = args.max_batch_size
    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    micro_batch_size = args.micro_batch_size

    def data_gen():
        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    inputs = data_gen()
    for k, v in inputs.items():
        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
            new_shape = [1] * v.dim()
            new_shape[0] = 16
            inputs[k] = v.to("cuda").repeat(*new_shape)

    model = SmoothLlamaForCausalLM.from_quantized(quantized_model_dir, model_basename="llama-7b")
    model = model.cuda()

    engine = CaiInferEngine(
        tp_size=2,
        pp_size=2,
        model=model,
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
        quant="smoothquant",
    )

    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == 32, f"{len(output)}, {32}"


def run_smoothquant_inference(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    run_llama_inference(args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
    args = parser.parse_args()

    spawn(run_smoothquant_inference, args.tp_size * args.pp_size, args=args)
examples/inference/hybrid_llama.py → examples/inference/run_llama_inference.py (renamed)

 import argparse
-import time

 import torch
 import torch.distributed as dist
-import transformers
 from transformers import LlamaForCausalLM, LlamaTokenizer

 import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
 from colossalai.testing import spawn


 def run_inference(args):
-    llama_model_path = args.path
+    llama_model_path = args.model_path
+    llama_tokenize_path = args.tokenizer_path
     max_input_len = args.max_input_len
     max_output_len = args.max_output_len
     max_batch_size = args.batch_size

@@ -21,23 +21,32 @@ def run_inference(args):
     pp_size = args.pp_size

     rank = dist.get_rank()

-    tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
+    tokenizer = LlamaTokenizer.from_pretrained(llama_tokenize_path, padding_side="left")
     tokenizer.pad_token_id = tokenizer.unk_token_id
-    model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
-    model = model.half()
-    model = transformers.LlamaForCausalLM(
-        transformers.LlamaConfig(
-            vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
-        )
-    )
+    if args.quant is None:
+        model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.unk_token_id)
+        model = model.half()
+    elif args.quant == "gptq":
+        from auto_gptq import AutoGPTQForCausalLM
+
+        model = AutoGPTQForCausalLM.from_quantized(
+            llama_model_path, inject_fused_attention=False, device=torch.cuda.current_device()
+        )
+    elif args.quant == "smoothquant":
+        from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
+
+        model = SmoothLlamaForCausalLM.from_quantized(llama_model_path, model_basename=args.smoothquant_base_name)
+        model = model.cuda()

-    engine = CaiInferEngine(
+    engine = InferenceEngine(
         tp_size=tp_size,
         pp_size=pp_size,
         model=model,
+        max_input_len=max_input_len,
         max_output_len=max_output_len,
         micro_batch_size=micro_batch_size,
+        quant=args.quant,
     )

     input_tokens = {

@@ -45,26 +54,9 @@ def run_inference(args):
         "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
     }

-    iters = 10
-    warmup = 3
-    times = []
-
-    for i in range(iters):
-        torch.cuda.synchronize()
-        start = time.time()
-        outputs = engine.generate(input_tokens)
-        torch.cuda.synchronize()
-        end = time.time()
-        if rank == 0:
-            out_len = len(outputs[0])
-            print("generation time {} s".format(str(end - start)))
-            print(out_len)
-            times.append((end - start) / out_len)
+    outputs = engine.generate(input_tokens)

     if rank == 0:
-        times = times[warmup:]
-        latency = sum(times) / len(times)
-        print("total process latency is : " + str(latency) + " s")
-        print("total throughput is : " + str(1 / latency * max_batch_size))
+        print(tokenizer.batch_decode(outputs))


 def run_tp_pipeline_inference(rank, world_size, port, args):

@@ -74,13 +66,24 @@ def run_tp_pipeline_inference(rank, world_size, port, args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
-    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
-    parser.add_argument("-pp", "--pp_size", type=int, default=1, help="Tensor parallel size")
-    parser.add_argument("-b", "--batch_size", type=int, default=64, help="Maximum batch size")
-    parser.add_argument("--max_input_len", type=int, default=512, help="Maximum input length")
-    parser.add_argument("--max_output_len", type=int, default=256, help="Maximum output length")
-    parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size")
+    parser.add_argument("-p", "--model_path", type=str, help="Model path", required=True)
+    parser.add_argument("--tokenizer_path", type=str, help="Tokenizer path", required=True)
+    parser.add_argument(
+        "-q",
+        "--quant",
+        type=str,
+        choices=["gptq", "smoothquant"],
+        default=None,
+        help="quantization type: 'gptq' or 'smoothquant'",
+    )
+    parser.add_argument("--smoothquant_base_name", type=str, default=None, help="soothquant base name")
+    parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument("-pp", "--pp_size", type=int, default=2, help="Pipeline parallel size")
+    parser.add_argument("-b", "--batch_size", type=int, default=4, help="Maximum batch size")
+    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
+    parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length")
+    parser.add_argument("--micro_batch_size", type=int, default=1, help="Micro batch size")
     args = parser.parse_args()

     spawn(run_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args)
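The merged example replaces the two deleted scripts by branching on a single --quant flag when loading the model. A condensed sketch of that dispatch, using the same loader calls shown in the diff above (the helper name load_llama and its signature are illustrative, not part of the example):

```python
from typing import Optional

import torch


def load_llama(model_path: str, quant: Optional[str], smoothquant_base_name: Optional[str] = None):
    """Pick a Llama loader the way run_llama_inference.py now branches on --quant."""
    if quant is None:
        from transformers import LlamaForCausalLM

        return LlamaForCausalLM.from_pretrained(model_path).half()
    elif quant == "gptq":
        from auto_gptq import AutoGPTQForCausalLM

        return AutoGPTQForCausalLM.from_quantized(
            model_path, inject_fused_attention=False, device=torch.cuda.current_device()
        )
    elif quant == "smoothquant":
        from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM

        return SmoothLlamaForCausalLM.from_quantized(model_path, model_basename=smoothquant_base_name).cuda()
    raise ValueError(f"Unsupported quant type: {quant}")
```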
requirements/requirements-infer.txt

@@ -3,5 +3,4 @@ packaging
 ninja
 auto-gptq==0.5.0
 git+https://github.com/ModelTC/lightllm.git@ece7b43f8a6dfa74027adc77c2c176cff28c76c8
-git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
 git+https://github.com/Dao-AILab/flash-attention.git@017716451d446e464dde9aca3a3c1ed2209caaa9
tests/test_infer/test_hybrid_bloom.py

@@ -7,7 +7,7 @@ import transformers
 from packaging import version

 import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")

@@ -36,7 +36,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
         transformers.BloomConfig(vocab_size=20000, hidden_size=512, n_head=4, n_layer=4)
     )
-    engine = CaiInferEngine(
+    engine = InferenceEngine(
         tp_size=tp_size,
         pp_size=pp_size,
         model=model,
tests/test_infer/test_hybrid_chatglm2.py

@@ -6,7 +6,7 @@ import torch.distributed as dist
 from packaging import version

 import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

@@ -44,7 +44,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     )
     model = ChatGLMForConditionalGeneration(chatglm_config)
-    engine = CaiInferEngine(
+    engine = InferenceEngine(
         tp_size=tp_size,
         pp_size=pp_size,
         model=model,
tests/test_infer/test_hybrid_llama.py

@@ -7,7 +7,7 @@ import transformers
 from packaging import version

 import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")

@@ -41,7 +41,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
         )
     )
-    engine = CaiInferEngine(
+    engine = InferenceEngine(
         tp_size=tp_size,
         pp_size=pp_size,
         model=model,