Add handson to ColossalAI. (#1896)

Co-authored-by: Boxiang Wang <boxiang.wang1@gmail.com>

Add handson to ColossalAI. (#1896)
Co-authored-by: Boxiang Wang <boxiang.wang1@gmail.com>
d9bf83e0 · BoxiangW · GitHub · 0ef8154b · d9bf83e0 · d9bf83e0
Unverified Commit d9bf83e0 authored Nov 11, 2022 by BoxiangW Committed by GitHub Nov 11, 2022
20 changed files
--- a/examples/tutorial/handson1/README.md
+++ b/examples/tutorial/handson1/README.md
+# Handson 1: Multi-dimensional Parallelism with Colossal-AI
+## Install Colossal-AI and other dependencies
+```bash
+sh install.sh
+```
+## Prepare Dataset
+We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
+If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+```bash
+export DATA=/path/to/data
+```
+## Run on 2*2 device mesh
+Current configuration setting on `config.py` is TP=2, PP=2.
+```bash
+colossalai run --nproc_per_node 4 train.py --config config.py
+```
\ No newline at end of file
--- a/examples/tutorial/handson1/config.py
+++ b/examples/tutorial/handson1/config.py
+from colossalai.amp import AMP_TYPE
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 10
+WARMUP_EPOCHS = 3
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 512
+DEPTH = 4
+NUM_HEADS = 4
+MLP_RATIO = 2
+NUM_CLASSES = 1000
+CHECKPOINT = False
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
+)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson1/install.sh
+++ b/examples/tutorial/handson1/install.sh
+pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
+pip install colossalai==0.1.10+torch1.12cu11.3 -f https://release.colossalai.org
+pip install titans
+colossalai check -i
\ No newline at end of file
--- a/examples/tutorial/handson1/train.py
+++ b/examples/tutorial/handson1/train.py
+import os
+import colossalai
+import torch
+from tqdm import tqdm
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.utils import is_using_pp, get_dataloader
+from colossalai.pipeline.pipelinable import PipelinableContext
+from titans.model.vit.vit import _create_vit_model
+from titans.dataloader.cifar10 import build_cifar
+def main():
+    # initialize distributed setting
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    # launch from torch
+    colossalai.launch_from_torch(config=args.config)
+    # get logger
+    logger = get_dist_logger()
+    logger.info("initialized distributed environment", ranks=[0])
+    if hasattr(gpc.config, 'LOG_PATH'):
+        if gpc.get_global_rank() == 0:
+            log_path = gpc.config.LOG_PATH
+            if not os.path.exists(log_path):
+                os.mkdir(log_path)
+            logger.log_to_file(log_path)
+    use_pipeline = is_using_pp()
+    # create model
+    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
+                        patch_size=gpc.config.PATCH_SIZE,
+                        hidden_size=gpc.config.HIDDEN_SIZE,
+                        depth=gpc.config.DEPTH,
+                        num_heads=gpc.config.NUM_HEADS,
+                        mlp_ratio=gpc.config.MLP_RATIO,
+                        num_classes=10,
+                        init_method='jax',
+                        checkpoint=gpc.config.CHECKPOINT)
+    if use_pipeline:
+        pipelinable = PipelinableContext()
+        with pipelinable:
+            model = _create_vit_model(**model_kwargs)
+        pipelinable.to_layer_list()
+        pipelinable.policy = "uniform"
+        model = pipelinable.partition(
+            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
+    else:
+        model = _create_vit_model(**model_kwargs)
+    # count number of parameters
+    total_numel = 0
+    for p in model.parameters():
+        total_numel += p.numel()
+    if not gpc.is_initialized(ParallelMode.PIPELINE):
+        pipeline_stage = 0
+    else:
+        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
+    logger.info(
+        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
+    # create dataloaders
+    root = os.environ.get('DATA', '../data/cifar10')
+    train_dataloader, test_dataloader = build_cifar(
+        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # create loss function
+    criterion = CrossEntropyLoss(label_smoothing=0.1)
+    # create optimizer
+    optimizer = torch.optim.AdamW(model.parameters(
+    ), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
+    # create lr scheduler
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                           total_steps=gpc.config.NUM_EPOCHS,
+                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
+    # initialize
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                         optimizer=optimizer,
+                                                                         criterion=criterion,
+                                                                         train_dataloader=train_dataloader,
+                                                                         test_dataloader=test_dataloader)
+    logger.info("Engine is built", ranks=[0])
+    data_iter = iter(train_dataloader)
+    for epoch in range(gpc.config.NUM_EPOCHS):
+        # training
+        engine.train()
+        if gpc.get_global_rank() == 0:
+            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
+            progress = tqdm(range(len(train_dataloader)), desc=description)
+        else:
+            progress = range(len(train_dataloader))
+        for _ in progress:
+            engine.zero_grad()
+            engine.execute_schedule(data_iter, return_output_label=False)
+            engine.step()
+            lr_scheduler.step()
+if __name__ == '__main__':
+    main()
--- a/examples/tutorial/handson2/README.md
+++ b/examples/tutorial/handson2/README.md
+# Handson 2: Sequence Parallelism with BERT
+## Prepare Dataset
+We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
+If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+```bash
+export DATA=/path/to/data
+```
+## Run on 2*2 device mesh
+Current configuration setting on `config.py` is TP=2, PP=2.
+```bash
+colossalai run --nproc_per_node 4 train.py --config config.py
+```
\ No newline at end of file
--- a/examples/tutorial/handson2/config.py
+++ b/examples/tutorial/handson2/config.py
+from colossalai.amp import AMP_TYPE
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 256
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 10
+WARMUP_EPOCHS = 3
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 512
+DEPTH = 4
+NUM_HEADS = 4
+MLP_RATIO = 2
+NUM_CLASSES = 1000
+CHECKPOINT = False
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
+# parallel setting
+TENSOR_PARALLEL_SIZE = 1
+TENSOR_PARALLEL_MODE = '1d'
+parallel = dict(
+    tensor=dict(size=4, mode='sequence')
+)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson2/train.py
+++ b/examples/tutorial/handson2/train.py
+import os
+import colossalai
+import torch
+from tqdm import tqdm
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.utils import is_using_pp, get_dataloader
+from colossalai.pipeline.pipelinable import PipelinableContext
+from titans.model.vit.vit import _create_vit_model
+from titans.dataloader.cifar10 import build_cifar
+def main():
+    # initialize distributed setting
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    # launch from torch
+    colossalai.launch_from_torch(config=args.config)
+    # get logger
+    logger = get_dist_logger()
+    logger.info("initialized distributed environment", ranks=[0])
+    if hasattr(gpc.config, 'LOG_PATH'):
+        if gpc.get_global_rank() == 0:
+            log_path = gpc.config.LOG_PATH
+            if not os.path.exists(log_path):
+                os.mkdir(log_path)
+            logger.log_to_file(log_path)
+    use_pipeline = is_using_pp()
+    # create model
+    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
+                        patch_size=gpc.config.PATCH_SIZE,
+                        hidden_size=gpc.config.HIDDEN_SIZE,
+                        depth=gpc.config.DEPTH,
+                        num_heads=gpc.config.NUM_HEADS,
+                        mlp_ratio=gpc.config.MLP_RATIO,
+                        num_classes=10,
+                        init_method='jax',
+                        checkpoint=gpc.config.CHECKPOINT)
+    if use_pipeline:
+        pipelinable = PipelinableContext()
+        with pipelinable:
+            model = _create_vit_model(**model_kwargs)
+        pipelinable.to_layer_list()
+        pipelinable.policy = "uniform"
+        model = pipelinable.partition(
+            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
+    else:
+        model = _create_vit_model(**model_kwargs)
+    # count number of parameters
+    total_numel = 0
+    for p in model.parameters():
+        total_numel += p.numel()
+    if not gpc.is_initialized(ParallelMode.PIPELINE):
+        pipeline_stage = 0
+    else:
+        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
+    logger.info(
+        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
+    # create dataloaders
+    root = os.environ.get('DATA', '../data/cifar10')
+    train_dataloader, test_dataloader = build_cifar(
+        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # create loss function
+    criterion = CrossEntropyLoss(label_smoothing=0.1)
+    # create optimizer
+    optimizer = torch.optim.AdamW(model.parameters(
+    ), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
+    # create lr scheduler
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                           total_steps=gpc.config.NUM_EPOCHS,
+                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
+    # initialize
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                         optimizer=optimizer,
+                                                                         criterion=criterion,
+                                                                         train_dataloader=train_dataloader,
+                                                                         test_dataloader=test_dataloader)
+    logger.info("Engine is built", ranks=[0])
+    data_iter = iter(train_dataloader)
+    for epoch in range(gpc.config.NUM_EPOCHS):
+        # training
+        engine.train()
+        if gpc.get_global_rank() == 0:
+            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
+            progress = tqdm(range(len(train_dataloader)), desc=description)
+        else:
+            progress = range(len(train_dataloader))
+        for _ in progress:
+            engine.zero_grad()
+            engine.execute_schedule(data_iter, return_output_label=False)
+            engine.step()
+            lr_scheduler.step()
+if __name__ == '__main__':
+    main()
--- a/examples/tutorial/auto_parallel/README.md
+++ b/examples/tutorial/auto_parallel/README.md
-# Train ResNet on CIFAR10 with auto_parallel
+# Handson 3: Auto-Parallelism with ResNet
 ## Prepare Dataset

--- a/examples/tutorial/auto_parallel/auto_ckpt_demo.ipynb
+++ b/examples/tutorial/auto_parallel/auto_ckpt_demo.ipynb
--- a/examples/tutorial/auto_parallel/auto_parallel_demo.py
+++ b/examples/tutorial/auto_parallel/auto_parallel_demo.py
--- a/examples/tutorial/auto_parallel/bench_utils.py
+++ b/examples/tutorial/auto_parallel/bench_utils.py
--- a/examples/tutorial/handson4/README.md
+++ b/examples/tutorial/handson4/README.md
+# Handson 4: Comparison of Large Batch Training Optimization
+## Prepare Dataset
+We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
+If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
+```bash
+export DATA=/path/to/data
+```
+## Run on 2*2 device mesh
+```bash
+colossalai run --nproc_per_node 4 train.py --config config.py
+```
\ No newline at end of file
--- a/examples/tutorial/handson4/config.py
+++ b/examples/tutorial/handson4/config.py
+from colossalai.amp import AMP_TYPE
+# hyperparameters
+# BATCH_SIZE is as per GPU
+# global batch size = BATCH_SIZE x data parallel size
+BATCH_SIZE = 512
+LEARNING_RATE = 3e-3
+WEIGHT_DECAY = 0.3
+NUM_EPOCHS = 10
+WARMUP_EPOCHS = 3
+# model config
+IMG_SIZE = 224
+PATCH_SIZE = 16
+HIDDEN_SIZE = 512
+DEPTH = 4
+NUM_HEADS = 4
+MLP_RATIO = 2
+NUM_CLASSES = 1000
+CHECKPOINT = False
+SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
+# parallel setting
+TENSOR_PARALLEL_SIZE = 2
+TENSOR_PARALLEL_MODE = '1d'
+parallel = dict(
+    pipeline=2,
+    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
+)
+fp16 = dict(mode=AMP_TYPE.NAIVE)
+clip_grad_norm = 1.0
+# pipeline config
+NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson4/train.py
+++ b/examples/tutorial/handson4/train.py
+import os
+import colossalai
+import torch
+from tqdm import tqdm
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import Lars, Lamb
+from colossalai.utils import is_using_pp, get_dataloader
+from colossalai.pipeline.pipelinable import PipelinableContext
+from titans.model.vit.vit import _create_vit_model
+from titans.dataloader.cifar10 import build_cifar
+def main():
+    # initialize distributed setting
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    # launch from torch
+    colossalai.launch_from_torch(config=args.config)
+    # get logger
+    logger = get_dist_logger()
+    logger.info("initialized distributed environment", ranks=[0])
+    if hasattr(gpc.config, 'LOG_PATH'):
+        if gpc.get_global_rank() == 0:
+            log_path = gpc.config.LOG_PATH
+            if not os.path.exists(log_path):
+                os.mkdir(log_path)
+            logger.log_to_file(log_path)
+    use_pipeline = is_using_pp()
+    # create model
+    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
+                        patch_size=gpc.config.PATCH_SIZE,
+                        hidden_size=gpc.config.HIDDEN_SIZE,
+                        depth=gpc.config.DEPTH,
+                        num_heads=gpc.config.NUM_HEADS,
+                        mlp_ratio=gpc.config.MLP_RATIO,
+                        num_classes=10,
+                        init_method='jax',
+                        checkpoint=gpc.config.CHECKPOINT)
+    if use_pipeline:
+        pipelinable = PipelinableContext()
+        with pipelinable:
+            model = _create_vit_model(**model_kwargs)
+        pipelinable.to_layer_list()
+        pipelinable.policy = "uniform"
+        model = pipelinable.partition(
+            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
+    else:
+        model = _create_vit_model(**model_kwargs)
+    # count number of parameters
+    total_numel = 0
+    for p in model.parameters():
+        total_numel += p.numel()
+    if not gpc.is_initialized(ParallelMode.PIPELINE):
+        pipeline_stage = 0
+    else:
+        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
+    logger.info(
+        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
+    # create dataloaders
+    root = os.environ.get('DATA', '../data/cifar10')
+    train_dataloader, test_dataloader = build_cifar(
+        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
+    # create loss function
+    criterion = CrossEntropyLoss(label_smoothing=0.1)
+    # create optimizer
+    optimizer = Lars(model.parameters(), lr=gpc.config.LEARNING_RATE,
+                     weight_decay=gpc.config.WEIGHT_DECAY)
+    # create lr scheduler
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
+                                           total_steps=gpc.config.NUM_EPOCHS,
+                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
+    # initialize
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                         optimizer=optimizer,
+                                                                         criterion=criterion,
+                                                                         train_dataloader=train_dataloader,
+                                                                         test_dataloader=test_dataloader)
+    logger.info("Engine is built", ranks=[0])
+    data_iter = iter(train_dataloader)
+    for epoch in range(gpc.config.NUM_EPOCHS):
+        # training
+        engine.train()
+        if gpc.get_global_rank() == 0:
+            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
+            progress = tqdm(range(len(train_dataloader)), desc=description)
+        else:
+            progress = range(len(train_dataloader))
+        for _ in progress:
+            engine.zero_grad()
+            engine.execute_schedule(data_iter, return_output_label=False)
+            engine.step()
+            lr_scheduler.step()
+if __name__ == '__main__':
+    main()
--- a/examples/tutorial/handson5/README.md
+++ b/examples/tutorial/handson5/README.md
+# Handson 5: Fine-tuning and Serving for OPT from Hugging Face
--- a/examples/tutorial/handson5/inference/README.md
+++ b/examples/tutorial/handson5/inference/README.md
+# Overview
+This is an example showing how to run OPT generation. The OPT model is implemented using ColossalAI.
+It supports tensor parallelism, batching and caching.
+# How to run
+Run OPT-125M:
+```shell
+python opt_fastapi.py opt-125m
+```
+It will launch a HTTP server on `0.0.0.0:7070` by default and you can customize host and port. You can open `localhost:7070/docs` in your browser to see the openapi docs.
+## Configure
+### Configure model
+```shell
+python opt_fastapi.py <model>
+```
+Available models: opt-125m, opt-6.7b, opt-30b, opt-175b.
+### Configure tensor parallelism
+```shell
+python opt_fastapi.py <model> --tp <TensorParallelismWorldSize>
+```
+The `<TensorParallelismWorldSize>` can be an integer in `[1, #GPUs]`. Default `1`.
+### Configure checkpoint
+```shell
+python opt_fastapi.py <model> --checkpoint <CheckpointPath>
+```
+The `<CheckpointPath>` can be a file path or a directory path. If it's a directory path, all files under the directory will be loaded.
+### Configure queue
+```shell
+python opt_fastapi.py <model> --queue_size <QueueSize>
+```
+The `<QueueSize>` can be an integer in `[0, MAXINT]`. If it's `0`, the request queue size is infinite. If it's a positive integer, when the request queue is full, incoming requests will be dropped (the HTTP status code of response will be 406).
+### Configure bathcing
+```shell
+python opt_fastapi.py <model> --max_batch_size <MaxBatchSize>
+```
+The `<MaxBatchSize>` can be an integer in `[1, MAXINT]`. The engine will make batch whose size is less or equal to this value.
+Note that the batch size is not always equal to `<MaxBatchSize>`, as some consecutive requests may not be batched.
+### Configure caching
+```shell
+python opt_fastapi.py <model> --cache_size <CacheSize> --cache_list_size <CacheListSize>
+```
+This will cache `<CacheSize>` unique requests. And for each unique request, it cache `<CacheListSize>` different results. A random result will be returned if the cache is hit.
+The `<CacheSize>` can be an integer in `[0, MAXINT]`. If it's `0`, cache won't be applied. The `<CacheListSize>` can be an integer in `[1, MAXINT]`.
+### Other configurations
+```shell
+python opt_fastapi.py -h
+```
+# How to benchmark
+```shell
+cd benchmark
+locust
+```
+Then open the web interface link which is on your console.
+# Pre-process pre-trained weights
+## OPT-66B
+See [script/processing_ckpt_66b.py](./script/processing_ckpt_66b.py).
+## OPT-175B
+See [script/process-opt-175b](./script/process-opt-175b/).
\ No newline at end of file
--- a/examples/tutorial/handson5/inference/batch.py
+++ b/examples/tutorial/handson5/inference/batch.py
+import torch
+from typing import List, Deque, Tuple, Hashable, Any
+from energonai import BatchManager, SubmitEntry, TaskEntry
+class BatchManagerForGeneration(BatchManager):
+    def __init__(self, max_batch_size: int = 1, pad_token_id: int = 0) -> None:
+        super().__init__()
+        self.max_batch_size = max_batch_size
+        self.pad_token_id = pad_token_id
+    def _left_padding(self, batch_inputs):
+        max_len = max(len(inputs['input_ids']) for inputs in batch_inputs)
+        outputs = {'input_ids': [], 'attention_mask': []}
+        for inputs in batch_inputs:
+            input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
+            padding_len = max_len - len(input_ids)
+            input_ids = [self.pad_token_id] * padding_len + input_ids
+            attention_mask = [0] * padding_len + attention_mask
+            outputs['input_ids'].append(input_ids)
+            outputs['attention_mask'].append(attention_mask)
+        for k in outputs:
+            outputs[k] = torch.tensor(outputs[k])
+        return outputs, max_len
+    @staticmethod
+    def _make_batch_key(entry: SubmitEntry) -> tuple:
+        data = entry.data
+        return (data['top_k'], data['top_p'], data['temperature'])
+    def make_batch(self, q: Deque[SubmitEntry]) -> Tuple[TaskEntry, dict]:
+        entry = q.popleft()
+        uids = [entry.uid]
+        batch = [entry.data]
+        while len(batch) < self.max_batch_size:
+            if len(q) == 0:
+                break
+            if self._make_batch_key(entry) != self._make_batch_key(q[0]):
+                break
+            if q[0].data['max_tokens'] > entry.data['max_tokens']:
+                break
+            e = q.popleft()
+            batch.append(e.data)
+            uids.append(e.uid)
+        inputs, max_len = self._left_padding(batch)
+        trunc_lens = []
+        for data in batch:
+            trunc_lens.append(max_len + data['max_tokens'])
+        inputs['top_k'] = entry.data['top_k']
+        inputs['top_p'] = entry.data['top_p']
+        inputs['temperature'] = entry.data['temperature']
+        inputs['max_tokens'] = max_len + entry.data['max_tokens']
+        return TaskEntry(tuple(uids), inputs), {'trunc_lens': trunc_lens}
+    def split_batch(self, task_entry: TaskEntry, trunc_lens: List[int] = []) -> List[Tuple[Hashable, Any]]:
+        retval = []
+        for uid, output, trunc_len in zip(task_entry.uids, task_entry.batch, trunc_lens):
+            retval.append((uid, output[:trunc_len]))
+        return retval
--- a/examples/tutorial/handson5/inference/benchmark/locustfile.py
+++ b/examples/tutorial/handson5/inference/benchmark/locustfile.py
+from locust import HttpUser, task
+from json import JSONDecodeError
+class GenerationUser(HttpUser):
+    @task
+    def generate(self):
+        prompt = 'Question: What is the longest river on the earth? Answer:'
+        for i in range(4, 9):
+            data = {'max_tokens': 2**i, 'prompt': prompt}
+            with self.client.post('/generation', json=data, catch_response=True) as response:
+                if response.status_code in (200, 406):
+                    response.success()
+                else:
+                    response.failure('Response wrong')
--- a/examples/tutorial/handson5/inference/cache.py
+++ b/examples/tutorial/handson5/inference/cache.py
+from collections import OrderedDict
+from threading import Lock
+from contextlib import contextmanager
+from typing import List, Any, Hashable, Dict
+class MissCacheError(Exception):
+    pass
+class ListCache:
+    def __init__(self, cache_size: int, list_size: int, fixed_keys: List[Hashable] = []) -> None:
+        """Cache a list of values. The fixed keys won't be removed. For other keys, LRU is applied.
+        When the value list is not full, a cache miss occurs. Otherwise, a cache hit occurs. Redundant values will be removed.
+        Args:
+            cache_size (int): Max size for LRU cache.
+            list_size (int): Value list size.
+            fixed_keys (List[Hashable], optional): The keys which won't be removed. Defaults to [].
+        """
+        self.cache_size = cache_size
+        self.list_size = list_size
+        self.cache: OrderedDict[Hashable, List[Any]] = OrderedDict()
+        self.fixed_cache: Dict[Hashable, List[Any]] = {}
+        for key in fixed_keys:
+            self.fixed_cache[key] = []
+        self._lock = Lock()
+    def get(self, key: Hashable) -> List[Any]:
+        with self.lock():
+            if key in self.fixed_cache:
+                l = self.fixed_cache[key]
+                if len(l) >= self.list_size:
+                    return l
+            elif key in self.cache:
+                self.cache.move_to_end(key)
+                l = self.cache[key]
+                if len(l) >= self.list_size:
+                    return l
+        raise MissCacheError()
+    def add(self, key: Hashable, value: Any) -> None:
+        with self.lock():
+            if key in self.fixed_cache:
+                l = self.fixed_cache[key]
+                if len(l) < self.list_size and value not in l:
+                    l.append(value)
+            elif key in self.cache:
+                self.cache.move_to_end(key)
+                l = self.cache[key]
+                if len(l) < self.list_size and value not in l:
+                    l.append(value)
+            else:
+                if len(self.cache) >= self.cache_size:
+                    self.cache.popitem(last=False)
+                self.cache[key] = [value]
+    @contextmanager
+    def lock(self):
+        try:
+            self._lock.acquire()
+            yield
+        finally:
+            self._lock.release()
--- a/examples/tutorial/handson5/inference/opt_fastapi.py
+++ b/examples/tutorial/handson5/inference/opt_fastapi.py
+import argparse
+import logging
+import random
+from typing import Optional
+import uvicorn
+from energonai import QueueFullError, launch_engine
+from energonai.model import opt_6B, opt_30B, opt_125M, opt_175B
+from fastapi import FastAPI, HTTPException, Request
+from pydantic import BaseModel, Field
+from transformers import GPT2Tokenizer
+from batch import BatchManagerForGeneration
+from cache import ListCache, MissCacheError
+class GenerationTaskReq(BaseModel):
+    max_tokens: int = Field(gt=0, le=256, example=64)
+    prompt: str = Field(
+        min_length=1, example='Question: Where were the 2004 Olympics held?\nAnswer: Athens, Greece\n\nQuestion: What is the longest river on the earth?\nAnswer:')
+    top_k: Optional[int] = Field(default=None, gt=0, example=50)
+    top_p: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.5)
+    temperature: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.7)
+app = FastAPI()
+@app.post('/generation')
+async def generate(data: GenerationTaskReq, request: Request):
+    logger.info(f'{request.client.host}:{request.client.port} - "{request.method} {request.url.path}" - {data}')
+    key = (data.prompt, data.max_tokens)
+    try:
+        if cache is None:
+            raise MissCacheError()
+        outputs = cache.get(key)
+        output = random.choice(outputs)
+        logger.info('Cache hit')
+    except MissCacheError:
+        inputs = tokenizer(data.prompt, truncation=True, max_length=512)
+        inputs['max_tokens'] = data.max_tokens
+        inputs['top_k'] = data.top_k
+        inputs['top_p'] = data.top_p
+        inputs['temperature'] = data.temperature
+        try:
+            uid = id(data)
+            engine.submit(uid, inputs)
+            output = await engine.wait(uid)
+            output = tokenizer.decode(output, skip_special_tokens=True)
+            if cache is not None:
+                cache.add(key, output)
+        except QueueFullError as e:
+            raise HTTPException(status_code=406, detail=e.args[0])
+    return {'text': output}
+@app.on_event("shutdown")
+async def shutdown(*_):
+    engine.shutdown()
+    server.should_exit = True
+    server.force_exit = True
+    await server.shutdown()
+def get_model_fn(model_name: str):
+    model_map = {
+        'opt-125m': opt_125M,
+        'opt-6.7b': opt_6B,
+        'opt-30b': opt_30B,
+        'opt-175b': opt_175B
+    }
+    return model_map[model_name]
+def print_args(args: argparse.Namespace):
+    print('\n==> Args:')
+    for k, v in args.__dict__.items():
+        print(f'{k} = {v}')
+FIXED_CACHE_KEYS = [
+    ('Question: What is the name of the largest continent on earth?\nAnswer: Asia\n\nQuestion: What is at the center of the solar system?\nAnswer:', 64),
+    ('A chat between a salesman and a student.\n\nSalesman: Hi boy, are you looking for a new phone?\nStudent: Yes, my phone is not functioning well.\nSalesman: What is your budget? \nStudent: I have received my scholarship so I am fine with any phone.\nSalesman: Great, then perhaps this latest flagship phone is just right for you.', 64),
+    ("English: I am happy today.\nChinese: 我今天很开心。\n\nEnglish: I am going to play basketball.\nChinese: 我一会去打篮球。\n\nEnglish: Let's celebrate our anniversary.\nChinese:", 64)
+]
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('model', choices=['opt-125m', 'opt-6.7b', 'opt-30b', 'opt-175b'])
+    parser.add_argument('--tp', type=int, default=1)
+    parser.add_argument('--master_host', default='localhost')
+    parser.add_argument('--master_port', type=int, default=19990)
+    parser.add_argument('--rpc_port', type=int, default=19980)
+    parser.add_argument('--max_batch_size', type=int, default=8)
+    parser.add_argument('--pipe_size', type=int, default=1)
+    parser.add_argument('--queue_size', type=int, default=0)
+    parser.add_argument('--http_host', default='0.0.0.0')
+    parser.add_argument('--http_port', type=int, default=7070)
+    parser.add_argument('--checkpoint', default=None)
+    parser.add_argument('--cache_size', type=int, default=0)
+    parser.add_argument('--cache_list_size', type=int, default=1)
+    args = parser.parse_args()
+    print_args(args)
+    model_kwargs = {}
+    if args.checkpoint is not None:
+        model_kwargs['checkpoint'] = args.checkpoint
+    logger = logging.getLogger(__name__)
+    tokenizer = GPT2Tokenizer.from_pretrained('facebook/opt-30b')
+    if args.cache_size > 0:
+        cache = ListCache(args.cache_size, args.cache_list_size, fixed_keys=FIXED_CACHE_KEYS)
+    else:
+        cache = None
+    engine = launch_engine(args.tp, 1, args.master_host, args.master_port, args.rpc_port, get_model_fn(args.model),
+                           batch_manager=BatchManagerForGeneration(max_batch_size=args.max_batch_size,
+                                                                   pad_token_id=tokenizer.pad_token_id),
+                           pipe_size=args.pipe_size,
+                           queue_size=args.queue_size,
+                           **model_kwargs)
+    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
+    server = uvicorn.Server(config=config)
+    server.run()