Unverified Commit fb103cfd authored by Xu Kai, committed by GitHub

[inference] update examples and engine (#5073)

* update examples and engine

* fix choices

* update example
parent 0c7d8beb
-from .engine import CaiInferEngine
+from .engine import InferenceEngine
from .engine.policies import BloomModelInferPolicy, ChatGLM2InferPolicy, LlamaModelInferPolicy
-__all__ = ["CaiInferEngine", "LlamaModelInferPolicy", "BloomModelInferPolicy", "ChatGLM2InferPolicy"]
+__all__ = ["InferenceEngine", "LlamaModelInferPolicy", "BloomModelInferPolicy", "ChatGLM2InferPolicy"]

-from .engine import CaiInferEngine
+from .engine import InferenceEngine
-__all__ = ["CaiInferEngine"]
+__all__ = ["InferenceEngine"]
@@ -3,7 +3,6 @@ from typing import Union
import torch
import torch.distributed as dist
import torch.nn as nn
-from transformers.tokenization_utils_base import BatchEncoding
from transformers.utils import logging

from colossalai.cluster import ProcessGroupMesh
@@ -27,9 +26,9 @@ _supported_models = [
]


-class CaiInferEngine:
+class InferenceEngine:
    """
-    CaiInferEngine is a class that handles the pipeline parallel inference.
+    InferenceEngine is a class that handles the pipeline parallel inference.

    Args:
        tp_size (int): the size of tensor parallelism.
@@ -42,27 +41,6 @@ class CaiInferEngine:
        max_input_len (int): the maximum input length.
        max_output_len (int): the maximum output length.
-    Example:
-    ```python
-    from colossalai.inference import InferEngine
-    from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
-    import colossalai
-    from transformers import LlamaForCausalLM, LlamaTokenizer
-    colossalai.launch_from_torch(config={})
-    model = LlamaForCausalLM.from_pretrained("your_path_to_model")
-    tokenizer = LlamaTokenizer.from_pretrained("/home/lczyh/share/models/llama-7b-hf")
-    # assume the model is infered with 2 pipeline stages
-    inferengine = CaiInferEngine(pp_size=2, model=model, model_policy=LlamaModelInferPolicy())
-    input = ["Introduce a landmark in China ","Introduce a landmark in China "]
-    data = tokenizer(input, return_tensors='pt')
-    output = inferengine.inference([data.to('cuda').data])
-    ```
    """
    def __init__(
@@ -148,7 +126,7 @@ class CaiInferEngine:
        if quant == "gptq":
            self.gptq_manager.post_init_gptq_buffer(self.model)

-    def generate(self, input_list: Union[BatchEncoding, dict]):
+    def generate(self, input_list: Union[list, dict]):
        """
        Args:
            input_list (list): a list of input data, each element is a `BatchEncoding` or `dict`.
@@ -157,11 +135,7 @@
            out (list): a list of output data, each element is a list of token.
            timestamp (float): the time cost of the inference, only return when verbose is `True`.
        """
-        assert isinstance(
-            input_list, (BatchEncoding, dict)
-        ), f"Only accept BatchEncoding or dict as input, but get {input_list.__class__.__name__}."
-        if isinstance(input_list, BatchEncoding):
-            input_list = input_list.data
        out, timestamp = self.schedule.generate_step(self.model, iter([input_list]))
        if self.verbose:
            return out, timestamp
...
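The docstring example removed above is not replaced in-line; as a rough substitute, here is a minimal usage sketch of the renamed engine, pieced together from the constructor arguments in this diff and the updated example script further down. The checkpoint path, the 2-stage pipeline layout, and the prompt are placeholders, not part of the commit.

```python
# Minimal sketch, assuming a local Llama checkpoint (placeholder path) and a
# distributed launch with 2 processes, e.g. via torchrun --nproc_per_node=2.
import colossalai
from transformers import LlamaForCausalLM, LlamaTokenizer

from colossalai.inference import InferenceEngine

colossalai.launch_from_torch(config={})

tokenizer = LlamaTokenizer.from_pretrained("path/to/llama-7b", padding_side="left")
tokenizer.pad_token_id = tokenizer.unk_token_id
model = LlamaForCausalLM.from_pretrained("path/to/llama-7b", pad_token_id=tokenizer.unk_token_id).half()

engine = InferenceEngine(
    tp_size=1,
    pp_size=2,
    model=model,
    max_input_len=32,
    max_output_len=32,
    micro_batch_size=1,
)

# generate() now takes a plain dict (or list) of tensors instead of a BatchEncoding.
data = tokenizer("Introduce a landmark in China", return_tensors="pt")
output = engine.generate({k: v.cuda() for k, v in data.items()})
```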
@@ -29,7 +29,7 @@ def parse_args():
        type=str,
        help="location of the calibration dataset",
    )
-    parser.add_argument("--num-samples", type=int, default=512)
+    parser.add_argument("--num-samples", type=int, default=10)
    parser.add_argument("--seq-len", type=int, default=512)
    args = parser.parse_args()
    return args
@@ -41,13 +41,12 @@ def main():
    model_path = args.model_name
    dataset_path = args.dataset_path
    output_path = args.output_path
-    num_samples = 10
-    seq_len = 512
+    num_samples = args.num_samples
+    seq_len = args.seq_len

    model, tokenizer = build_model_and_tokenizer(model_path)
    if not os.path.exists(dataset_path):
-        print(f"Cannot find the dataset at {args.dataset_path}")
-        raise FileNotFoundError
+        raise FileNotFoundError(f"Cannot find the dataset at {args.dataset_path}")
    dataset = load_dataset("json", data_files=dataset_path, split="train")

    model.quantized(tokenizer, dataset, num_samples=num_samples, seq_len=seq_len)
...
import argparse
import os

import torch
import torch.distributed as dist
from auto_gptq import AutoGPTQForCausalLM

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def run_llama_inference(args):
    quantized_model_dir = args.quantized_path
    max_batch_size = args.max_batch_size
    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    micro_batch_size = args.micro_batch_size

    # load quantized model to the first GPU
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir, inject_fused_attention=False, device=torch.cuda.current_device()
    )

    engine = CaiInferEngine(
        tp_size=2,
        pp_size=2,
        model=model,
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
        quant="gptq",
    )

    def data_gen():
        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    inputs = data_gen()
    for k, v in inputs.items():
        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
            new_shape = [1] * v.dim()
            new_shape[0] = 16
            inputs[k] = v.to("cuda").repeat(*new_shape)

    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"


def run_gptq_infernece(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_llama_inference(args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")

    args = parser.parse_args()

    spawn(run_gptq_infernece, args.tp_size * args.pp_size, args=args)
import argparse

import torch
import torch.distributed as dist

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn


@torch.no_grad()
def run_llama_inference(args):
    quantized_model_dir = args.quantized_path
    max_batch_size = args.max_batch_size
    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    micro_batch_size = args.micro_batch_size

    def data_gen():
        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
        return dict(input_ids=input_ids, attention_mask=attention_mask)

    inputs = data_gen()
    for k, v in inputs.items():
        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
            new_shape = [1] * v.dim()
            new_shape[0] = 16
            inputs[k] = v.to("cuda").repeat(*new_shape)

    model = SmoothLlamaForCausalLM.from_quantized(quantized_model_dir, model_basename="llama-7b")
    model = model.cuda()

    engine = CaiInferEngine(
        tp_size=2,
        pp_size=2,
        model=model,
        max_batch_size=max_batch_size,
        max_input_len=max_input_len,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
        quant="smoothquant",
    )

    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == 32, f"{len(output)}, {32}"


def run_smoothquant_inference(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_llama_inference(args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")

    args = parser.parse_args()

    spawn(run_smoothquant_inference, args.tp_size * args.pp_size, args=args)
import argparse
-import time

import torch
import torch.distributed as dist
-import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
from colossalai.testing import spawn


def run_inference(args):
-    llama_model_path = args.path
+    llama_model_path = args.model_path
+    llama_tokenize_path = args.tokenizer_path

    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    max_batch_size = args.batch_size
@@ -21,23 +21,32 @@ def run_inference(args):
    pp_size = args.pp_size
    rank = dist.get_rank()

-    tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
+    tokenizer = LlamaTokenizer.from_pretrained(llama_tokenize_path, padding_side="left")
    tokenizer.pad_token_id = tokenizer.unk_token_id

-    model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
-    model = model.half()
-    model = transformers.LlamaForCausalLM(
-        transformers.LlamaConfig(
-            vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
-        )
-    )
-    engine = CaiInferEngine(
+    if args.quant is None:
+        model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.unk_token_id)
+        model = model.half()
+    elif args.quant == "gptq":
+        from auto_gptq import AutoGPTQForCausalLM
+
+        model = AutoGPTQForCausalLM.from_quantized(
+            llama_model_path, inject_fused_attention=False, device=torch.cuda.current_device()
+        )
+    elif args.quant == "smoothquant":
+        from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
+
+        model = SmoothLlamaForCausalLM.from_quantized(llama_model_path, model_basename=args.smoothquant_base_name)
+        model = model.cuda()
+
+    engine = InferenceEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
+        max_input_len=max_input_len,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
+        quant=args.quant,
    )
    input_tokens = {
@@ -45,26 +54,9 @@ def run_inference(args):
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

-    iters = 10
-    warmup = 3
-    times = []
-    for i in range(iters):
-        torch.cuda.synchronize()
-        start = time.time()
-        outputs = engine.generate(input_tokens)
-        torch.cuda.synchronize()
-        end = time.time()
-        if rank == 0:
-            out_len = len(outputs[0])
-            print("generation time {} s".format(str(end - start)))
-            print(out_len)
-            times.append((end - start) / out_len)
+    outputs = engine.generate(input_tokens)

    if rank == 0:
-        times = times[warmup:]
-        latency = sum(times) / len(times)
-        print("total process latency is : " + str(latency) + " s")
-        print("total throughput is : " + str(1 / latency * max_batch_size))
+        print(tokenizer.batch_decode(outputs))
def run_tp_pipeline_inference(rank, world_size, port, args):
@@ -74,13 +66,24 @@ def run_tp_pipeline_inference(rank, world_size, port, args):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
-    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
-    parser.add_argument("-pp", "--pp_size", type=int, default=1, help="Tensor parallel size")
-    parser.add_argument("-b", "--batch_size", type=int, default=64, help="Maximum batch size")
-    parser.add_argument("--max_input_len", type=int, default=512, help="Maximum input length")
-    parser.add_argument("--max_output_len", type=int, default=256, help="Maximum output length")
-    parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size")
+    parser.add_argument("-p", "--model_path", type=str, help="Model path", required=True)
+    parser.add_argument("--tokenizer_path", type=str, help="Tokenizer path", required=True)
+    parser.add_argument(
+        "-q",
+        "--quant",
+        type=str,
+        choices=["gptq", "smoothquant"],
+        default=None,
+        help="quantization type: 'gptq' or 'smoothquant'",
+    )
+    parser.add_argument("--smoothquant_base_name", type=str, default=None, help="smoothquant base name")
+    parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument("-pp", "--pp_size", type=int, default=2, help="Pipeline parallel size")
+    parser.add_argument("-b", "--batch_size", type=int, default=4, help="Maximum batch size")
+    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
+    parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length")
+    parser.add_argument("--micro_batch_size", type=int, default=1, help="Micro batch size")
    args = parser.parse_args()
    spawn(run_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args)
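The updated example feeds synthetic `input_tokens`; with the tokenizer configured for left padding and `pad_token_id = unk_token_id` as above, real prompts can be batched the same way. A small sketch, assuming the `engine` and `tokenizer` objects were already built as in the example (the prompts are placeholders):

```python
# Sketch only: reuses the `engine` and `tokenizer` from the example above.
prompts = ["Introduce a landmark in China", "What is pipeline parallel inference?"]
batch = tokenizer(prompts, return_tensors="pt", padding=True)

# engine.generate accepts a dict of tensors; move them to the GPU first.
outputs = engine.generate({k: v.cuda() for k, v in batch.items()})

if dist.get_rank() == 0:
    # outputs holds generated token sequences; decode them back to text.
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```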
@@ -3,5 +3,4 @@ packaging
ninja
auto-gptq==0.5.0
git+https://github.com/ModelTC/lightllm.git@ece7b43f8a6dfa74027adc77c2c176cff28c76c8
-git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
git+https://github.com/Dao-AILab/flash-attention.git@017716451d446e464dde9aca3a3c1ed2209caaa9
@@ -7,7 +7,7 @@ import transformers
from packaging import version

import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
@@ -36,7 +36,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
        transformers.BloomConfig(vocab_size=20000, hidden_size=512, n_head=4, n_layer=4)
    )

-    engine = CaiInferEngine(
+    engine = InferenceEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
...
@@ -6,7 +6,7 @@ import torch.distributed as dist
from packaging import version

import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
@@ -44,7 +44,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    )
    model = ChatGLMForConditionalGeneration(chatglm_config)

-    engine = CaiInferEngine(
+    engine = InferenceEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
...
@@ -7,7 +7,7 @@ import transformers
from packaging import version

import colossalai
-from colossalai.inference import CaiInferEngine
+from colossalai.inference import InferenceEngine
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
@@ -41,7 +41,7 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
        )
    )

-    engine = CaiInferEngine(
+    engine = InferenceEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
...