[tutorial] removed duplicated tutorials (#1904)

cb7ec714 · Frank Lee · GitHub · 351f0f64 · cb7ec714 · cb7ec714
Unverified Commit cb7ec714 authored Nov 11, 2022 by Frank Lee Committed by GitHub Nov 11, 2022
20 changed files
--- a/examples/tutorial/handson3/README.md
+++ b/examples/tutorial/handson3/README.md
@@ -2,7 +2,7 @@
 ## Prepare Dataset
-We use CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default. 
+We use CIFAR10 dataset in this example. The dataset will be downloaded to `./data` by default.
 If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
 ```bash
@@ -14,4 +14,4 @@ export DATA=/path/to/data
 ```bash
 colossalai run --nproc_per_node 4 auto_parallel_demo.py
 ```
\ No newline at end of file
--- a/examples/tutorial/handson3/auto_ckpt_demo.ipynb
+++ b/examples/tutorial/handson3/auto_ckpt_demo.ipynb
--- a/examples/tutorial/handson3/auto_parallel_demo.py
+++ b/examples/tutorial/handson3/auto_parallel_demo.py
+import os
 from pathlib import Path
-from colossalai.logging import get_dist_logger
-import colossalai
 import torch
-import os
+from titans.utils import barrier_context
 from torch.fx import GraphModule
-from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
-from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_dataloader
 from torchvision import transforms
-from colossalai.nn.lr_scheduler import CosineAnnealingLR
 from torchvision.datasets import CIFAR10
 from torchvision.models import resnet50
 from tqdm import tqdm
-from titans.utils import barrier_context
+import colossalai
+from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
+from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
 from colossalai.auto_parallel.tensor_shard.solver.cost_graph import CostGraph
 from colossalai.auto_parallel.tensor_shard.solver.graph_analysis import GraphAnalyser
 from colossalai.auto_parallel.tensor_shard.solver.options import SolverOptions
 from colossalai.auto_parallel.tensor_shard.solver.solver import Solver
 from colossalai.auto_parallel.tensor_shard.solver.strategies_constructor import StrategiesConstructor
+from colossalai.core import global_context as gpc
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingLR
+from colossalai.utils import get_dataloader
 DATA_ROOT = Path(os.environ.get('DATA', './data'))
 BATCH_SIZE = 1024

--- a/examples/tutorial/handson3/bench_utils.py
+++ b/examples/tutorial/handson3/bench_utils.py
--- a/examples/tutorial/handson1/README.md
+++ b/examples/tutorial/handson1/README.md
-# Handson 1: Multi-dimensional Parallelism with Colossal-AI
-## Install Colossal-AI and other dependencies
-```bash
-sh install.sh
-```
-## Prepare Dataset
-We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
-```bash
-export DATA=/path/to/data
-```
-## Run on 2*2 device mesh
-Current configuration setting on `config.py` is TP=2, PP=2.
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py
-```
\ No newline at end of file
--- a/examples/tutorial/handson1/config.py
+++ b/examples/tutorial/handson1/config.py
-from colossalai.amp import AMP_TYPE
-# hyperparameters
-# BATCH_SIZE is as per GPU
-# global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 256
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 10
-WARMUP_EPOCHS = 3
-# model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 512
-DEPTH = 4
-NUM_HEADS = 4
-MLP_RATIO = 2
-NUM_CLASSES = 1000
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-# parallel setting
-TENSOR_PARALLEL_SIZE = 2
-TENSOR_PARALLEL_MODE = '1d'
-parallel = dict(
-    pipeline=2,
-    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
-)
-fp16 = dict(mode=AMP_TYPE.NAIVE)
-clip_grad_norm = 1.0
-# pipeline config
-NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson1/install.sh
+++ b/examples/tutorial/handson1/install.sh
-pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
-pip install colossalai==0.1.10+torch1.12cu11.3 -f https://release.colossalai.org
-pip install titans
-colossalai check -i
\ No newline at end of file
--- a/examples/tutorial/handson1/train.py
+++ b/examples/tutorial/handson1/train.py
-import os
-import colossalai
-import torch
-from tqdm import tqdm
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.utils import is_using_pp, get_dataloader
-from colossalai.pipeline.pipelinable import PipelinableContext
-from titans.model.vit.vit import _create_vit_model
-from titans.dataloader.cifar10 import build_cifar
-def main():
-    # initialize distributed setting
-    parser = colossalai.get_default_parser()
-    args = parser.parse_args()
-    # launch from torch
-    colossalai.launch_from_torch(config=args.config)
-    # get logger
-    logger = get_dist_logger()
-    logger.info("initialized distributed environment", ranks=[0])
-    if hasattr(gpc.config, 'LOG_PATH'):
-        if gpc.get_global_rank() == 0:
-            log_path = gpc.config.LOG_PATH
-            if not os.path.exists(log_path):
-                os.mkdir(log_path)
-            logger.log_to_file(log_path)
-    use_pipeline = is_using_pp()
-    # create model
-    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
-                        patch_size=gpc.config.PATCH_SIZE,
-                        hidden_size=gpc.config.HIDDEN_SIZE,
-                        depth=gpc.config.DEPTH,
-                        num_heads=gpc.config.NUM_HEADS,
-                        mlp_ratio=gpc.config.MLP_RATIO,
-                        num_classes=10,
-                        init_method='jax',
-                        checkpoint=gpc.config.CHECKPOINT)
-    if use_pipeline:
-        pipelinable = PipelinableContext()
-        with pipelinable:
-            model = _create_vit_model(**model_kwargs)
-        pipelinable.to_layer_list()
-        pipelinable.policy = "uniform"
-        model = pipelinable.partition(
-            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-    else:
-        model = _create_vit_model(**model_kwargs)
-    # count number of parameters
-    total_numel = 0
-    for p in model.parameters():
-        total_numel += p.numel()
-    if not gpc.is_initialized(ParallelMode.PIPELINE):
-        pipeline_stage = 0
-    else:
-        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
-    logger.info(
-        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
-    # create dataloaders
-    root = os.environ.get('DATA', '../data/cifar10')
-    train_dataloader, test_dataloader = build_cifar(
-        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
-    # create loss function
-    criterion = CrossEntropyLoss(label_smoothing=0.1)
-    # create optimizer
-    optimizer = torch.optim.AdamW(model.parameters(
-    ), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
-    # create lr scheduler
-    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
-                                           total_steps=gpc.config.NUM_EPOCHS,
-                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
-    # initialize
-    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
-                                                                         optimizer=optimizer,
-                                                                         criterion=criterion,
-                                                                         train_dataloader=train_dataloader,
-                                                                         test_dataloader=test_dataloader)
-    logger.info("Engine is built", ranks=[0])
-    data_iter = iter(train_dataloader)
-    for epoch in range(gpc.config.NUM_EPOCHS):
-        # training
-        engine.train()
-        if gpc.get_global_rank() == 0:
-            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
-            progress = tqdm(range(len(train_dataloader)), desc=description)
-        else:
-            progress = range(len(train_dataloader))
-        for _ in progress:
-            engine.zero_grad()
-            engine.execute_schedule(data_iter, return_output_label=False)
-            engine.step()
-            lr_scheduler.step()
-if __name__ == '__main__':
-    main()
--- a/examples/tutorial/handson2/README.md
+++ b/examples/tutorial/handson2/README.md
-# Handson 2: Sequence Parallelism with BERT
-## Prepare Dataset
-We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
-```bash
-export DATA=/path/to/data
-```
-## Run on 2*2 device mesh
-Current configuration setting on `config.py` is TP=2, PP=2.
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py
-```
\ No newline at end of file
--- a/examples/tutorial/handson2/config.py
+++ b/examples/tutorial/handson2/config.py
-from colossalai.amp import AMP_TYPE
-# hyperparameters
-# BATCH_SIZE is as per GPU
-# global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 256
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 10
-WARMUP_EPOCHS = 3
-# model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 512
-DEPTH = 4
-NUM_HEADS = 4
-MLP_RATIO = 2
-NUM_CLASSES = 1000
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-# parallel setting
-TENSOR_PARALLEL_SIZE = 1
-TENSOR_PARALLEL_MODE = '1d'
-parallel = dict(
-    tensor=dict(size=4, mode='sequence')
-)
-fp16 = dict(mode=AMP_TYPE.NAIVE)
-clip_grad_norm = 1.0
-# pipeline config
-NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson2/train.py
+++ b/examples/tutorial/handson2/train.py
-import os
-import colossalai
-import torch
-from tqdm import tqdm
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.utils import is_using_pp, get_dataloader
-from colossalai.pipeline.pipelinable import PipelinableContext
-from titans.model.vit.vit import _create_vit_model
-from titans.dataloader.cifar10 import build_cifar
-def main():
-    # initialize distributed setting
-    parser = colossalai.get_default_parser()
-    args = parser.parse_args()
-    # launch from torch
-    colossalai.launch_from_torch(config=args.config)
-    # get logger
-    logger = get_dist_logger()
-    logger.info("initialized distributed environment", ranks=[0])
-    if hasattr(gpc.config, 'LOG_PATH'):
-        if gpc.get_global_rank() == 0:
-            log_path = gpc.config.LOG_PATH
-            if not os.path.exists(log_path):
-                os.mkdir(log_path)
-            logger.log_to_file(log_path)
-    use_pipeline = is_using_pp()
-    # create model
-    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
-                        patch_size=gpc.config.PATCH_SIZE,
-                        hidden_size=gpc.config.HIDDEN_SIZE,
-                        depth=gpc.config.DEPTH,
-                        num_heads=gpc.config.NUM_HEADS,
-                        mlp_ratio=gpc.config.MLP_RATIO,
-                        num_classes=10,
-                        init_method='jax',
-                        checkpoint=gpc.config.CHECKPOINT)
-    if use_pipeline:
-        pipelinable = PipelinableContext()
-        with pipelinable:
-            model = _create_vit_model(**model_kwargs)
-        pipelinable.to_layer_list()
-        pipelinable.policy = "uniform"
-        model = pipelinable.partition(
-            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-    else:
-        model = _create_vit_model(**model_kwargs)
-    # count number of parameters
-    total_numel = 0
-    for p in model.parameters():
-        total_numel += p.numel()
-    if not gpc.is_initialized(ParallelMode.PIPELINE):
-        pipeline_stage = 0
-    else:
-        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
-    logger.info(
-        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
-    # create dataloaders
-    root = os.environ.get('DATA', '../data/cifar10')
-    train_dataloader, test_dataloader = build_cifar(
-        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
-    # create loss function
-    criterion = CrossEntropyLoss(label_smoothing=0.1)
-    # create optimizer
-    optimizer = torch.optim.AdamW(model.parameters(
-    ), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY)
-    # create lr scheduler
-    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
-                                           total_steps=gpc.config.NUM_EPOCHS,
-                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
-    # initialize
-    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
-                                                                         optimizer=optimizer,
-                                                                         criterion=criterion,
-                                                                         train_dataloader=train_dataloader,
-                                                                         test_dataloader=test_dataloader)
-    logger.info("Engine is built", ranks=[0])
-    data_iter = iter(train_dataloader)
-    for epoch in range(gpc.config.NUM_EPOCHS):
-        # training
-        engine.train()
-        if gpc.get_global_rank() == 0:
-            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
-            progress = tqdm(range(len(train_dataloader)), desc=description)
-        else:
-            progress = range(len(train_dataloader))
-        for _ in progress:
-            engine.zero_grad()
-            engine.execute_schedule(data_iter, return_output_label=False)
-            engine.step()
-            lr_scheduler.step()
-if __name__ == '__main__':
-    main()
--- a/examples/tutorial/handson4/README.md
+++ b/examples/tutorial/handson4/README.md
-# Handson 4: Comparison of Large Batch Training Optimization
-## Prepare Dataset
-We use CIFAR10 dataset in this example. The dataset will be downloaded to `../data` by default. 
-If you wish to use customized directory for the dataset. You can set the environment variable `DATA` via the following command.
-```bash
-export DATA=/path/to/data
-```
-## Run on 2*2 device mesh
-```bash
-colossalai run --nproc_per_node 4 train.py --config config.py
-```
\ No newline at end of file
--- a/examples/tutorial/handson4/config.py
+++ b/examples/tutorial/handson4/config.py
-from colossalai.amp import AMP_TYPE
-# hyperparameters
-# BATCH_SIZE is as per GPU
-# global batch size = BATCH_SIZE x data parallel size
-BATCH_SIZE = 512
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 10
-WARMUP_EPOCHS = 3
-# model config
-IMG_SIZE = 224
-PATCH_SIZE = 16
-HIDDEN_SIZE = 512
-DEPTH = 4
-NUM_HEADS = 4
-MLP_RATIO = 2
-NUM_CLASSES = 1000
-CHECKPOINT = False
-SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE)**2 + 1    # add 1 for cls token
-# parallel setting
-TENSOR_PARALLEL_SIZE = 2
-TENSOR_PARALLEL_MODE = '1d'
-parallel = dict(
-    pipeline=2,
-    tensor=dict(mode=TENSOR_PARALLEL_MODE, size=TENSOR_PARALLEL_SIZE),
-)
-fp16 = dict(mode=AMP_TYPE.NAIVE)
-clip_grad_norm = 1.0
-# pipeline config
-NUM_MICRO_BATCHES = parallel['pipeline']
--- a/examples/tutorial/handson4/train.py
+++ b/examples/tutorial/handson4/train.py
-import os
-import colossalai
-import torch
-from tqdm import tqdm
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
-from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
-from colossalai.nn.optimizer import Lars, Lamb
-from colossalai.utils import is_using_pp, get_dataloader
-from colossalai.pipeline.pipelinable import PipelinableContext
-from titans.model.vit.vit import _create_vit_model
-from titans.dataloader.cifar10 import build_cifar
-def main():
-    # initialize distributed setting
-    parser = colossalai.get_default_parser()
-    args = parser.parse_args()
-    # launch from torch
-    colossalai.launch_from_torch(config=args.config)
-    # get logger
-    logger = get_dist_logger()
-    logger.info("initialized distributed environment", ranks=[0])
-    if hasattr(gpc.config, 'LOG_PATH'):
-        if gpc.get_global_rank() == 0:
-            log_path = gpc.config.LOG_PATH
-            if not os.path.exists(log_path):
-                os.mkdir(log_path)
-            logger.log_to_file(log_path)
-    use_pipeline = is_using_pp()
-    # create model
-    model_kwargs = dict(img_size=gpc.config.IMG_SIZE,
-                        patch_size=gpc.config.PATCH_SIZE,
-                        hidden_size=gpc.config.HIDDEN_SIZE,
-                        depth=gpc.config.DEPTH,
-                        num_heads=gpc.config.NUM_HEADS,
-                        mlp_ratio=gpc.config.MLP_RATIO,
-                        num_classes=10,
-                        init_method='jax',
-                        checkpoint=gpc.config.CHECKPOINT)
-    if use_pipeline:
-        pipelinable = PipelinableContext()
-        with pipelinable:
-            model = _create_vit_model(**model_kwargs)
-        pipelinable.to_layer_list()
-        pipelinable.policy = "uniform"
-        model = pipelinable.partition(
-            1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-    else:
-        model = _create_vit_model(**model_kwargs)
-    # count number of parameters
-    total_numel = 0
-    for p in model.parameters():
-        total_numel += p.numel()
-    if not gpc.is_initialized(ParallelMode.PIPELINE):
-        pipeline_stage = 0
-    else:
-        pipeline_stage = gpc.get_local_rank(ParallelMode.PIPELINE)
-    logger.info(
-        f"number of parameters: {total_numel} on pipeline stage {pipeline_stage}")
-    # create dataloaders
-    root = os.environ.get('DATA', '../data/cifar10')
-    train_dataloader, test_dataloader = build_cifar(
-        gpc.config.BATCH_SIZE, root, pad_if_needed=True)
-    # create loss function
-    criterion = CrossEntropyLoss(label_smoothing=0.1)
-    # create optimizer
-    optimizer = Lars(model.parameters(), lr=gpc.config.LEARNING_RATE,
-                     weight_decay=gpc.config.WEIGHT_DECAY)
-    # create lr scheduler
-    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
-                                           total_steps=gpc.config.NUM_EPOCHS,
-                                           warmup_steps=gpc.config.WARMUP_EPOCHS)
-    # initialize
-    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
-                                                                         optimizer=optimizer,
-                                                                         criterion=criterion,
-                                                                         train_dataloader=train_dataloader,
-                                                                         test_dataloader=test_dataloader)
-    logger.info("Engine is built", ranks=[0])
-    data_iter = iter(train_dataloader)
-    for epoch in range(gpc.config.NUM_EPOCHS):
-        # training
-        engine.train()
-        if gpc.get_global_rank() == 0:
-            description = 'Epoch {} / {}'.format(epoch, gpc.config.NUM_EPOCHS)
-            progress = tqdm(range(len(train_dataloader)), desc=description)
-        else:
-            progress = range(len(train_dataloader))
-        for _ in progress:
-            engine.zero_grad()
-            engine.execute_schedule(data_iter, return_output_label=False)
-            engine.step()
-            lr_scheduler.step()
-if __name__ == '__main__':
-    main()
--- a/examples/tutorial/handson5/README.md
+++ b/examples/tutorial/handson5/README.md
-# Handson 5: Fine-tuning and Serving for OPT from Hugging Face
--- a/examples/tutorial/handson5/inference/README.md
+++ b/examples/tutorial/handson5/inference/README.md
-# Overview
-This is an example showing how to run OPT generation. The OPT model is implemented using ColossalAI.
-It supports tensor parallelism, batching and caching.
-# How to run
-Run OPT-125M:
-```shell
-python opt_fastapi.py opt-125m
-```
-It will launch a HTTP server on `0.0.0.0:7070` by default and you can customize host and port. You can open `localhost:7070/docs` in your browser to see the openapi docs.
-## Configure
-### Configure model
-```shell
-python opt_fastapi.py <model>
-```
-Available models: opt-125m, opt-6.7b, opt-30b, opt-175b.
-### Configure tensor parallelism
-```shell
-python opt_fastapi.py <model> --tp <TensorParallelismWorldSize>
-```
-The `<TensorParallelismWorldSize>` can be an integer in `[1, #GPUs]`. Default `1`.
-### Configure checkpoint
-```shell
-python opt_fastapi.py <model> --checkpoint <CheckpointPath>
-```
-The `<CheckpointPath>` can be a file path or a directory path. If it's a directory path, all files under the directory will be loaded.
-### Configure queue
-```shell
-python opt_fastapi.py <model> --queue_size <QueueSize>
-```
-The `<QueueSize>` can be an integer in `[0, MAXINT]`. If it's `0`, the request queue size is infinite. If it's a positive integer, when the request queue is full, incoming requests will be dropped (the HTTP status code of response will be 406).
-### Configure bathcing
-```shell
-python opt_fastapi.py <model> --max_batch_size <MaxBatchSize>
-```
-The `<MaxBatchSize>` can be an integer in `[1, MAXINT]`. The engine will make batch whose size is less or equal to this value.
-Note that the batch size is not always equal to `<MaxBatchSize>`, as some consecutive requests may not be batched.
-### Configure caching
-```shell
-python opt_fastapi.py <model> --cache_size <CacheSize> --cache_list_size <CacheListSize>
-```
-This will cache `<CacheSize>` unique requests. And for each unique request, it cache `<CacheListSize>` different results. A random result will be returned if the cache is hit.
-The `<CacheSize>` can be an integer in `[0, MAXINT]`. If it's `0`, cache won't be applied. The `<CacheListSize>` can be an integer in `[1, MAXINT]`.
-### Other configurations
-```shell
-python opt_fastapi.py -h
-```
-# How to benchmark
-```shell
-cd benchmark
-locust
-```
-Then open the web interface link which is on your console.
-# Pre-process pre-trained weights
-## OPT-66B
-See [script/processing_ckpt_66b.py](./script/processing_ckpt_66b.py).
-## OPT-175B
-See [script/process-opt-175b](./script/process-opt-175b/).
\ No newline at end of file
--- a/examples/tutorial/handson5/inference/batch.py
+++ b/examples/tutorial/handson5/inference/batch.py
-import torch
-from typing import List, Deque, Tuple, Hashable, Any
-from energonai import BatchManager, SubmitEntry, TaskEntry
-class BatchManagerForGeneration(BatchManager):
-    def __init__(self, max_batch_size: int = 1, pad_token_id: int = 0) -> None:
-        super().__init__()
-        self.max_batch_size = max_batch_size
-        self.pad_token_id = pad_token_id
-    def _left_padding(self, batch_inputs):
-        max_len = max(len(inputs['input_ids']) for inputs in batch_inputs)
-        outputs = {'input_ids': [], 'attention_mask': []}
-        for inputs in batch_inputs:
-            input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
-            padding_len = max_len - len(input_ids)
-            input_ids = [self.pad_token_id] * padding_len + input_ids
-            attention_mask = [0] * padding_len + attention_mask
-            outputs['input_ids'].append(input_ids)
-            outputs['attention_mask'].append(attention_mask)
-        for k in outputs:
-            outputs[k] = torch.tensor(outputs[k])
-        return outputs, max_len
-    @staticmethod
-    def _make_batch_key(entry: SubmitEntry) -> tuple:
-        data = entry.data
-        return (data['top_k'], data['top_p'], data['temperature'])
-    def make_batch(self, q: Deque[SubmitEntry]) -> Tuple[TaskEntry, dict]:
-        entry = q.popleft()
-        uids = [entry.uid]
-        batch = [entry.data]
-        while len(batch) < self.max_batch_size:
-            if len(q) == 0:
-                break
-            if self._make_batch_key(entry) != self._make_batch_key(q[0]):
-                break
-            if q[0].data['max_tokens'] > entry.data['max_tokens']:
-                break
-            e = q.popleft()
-            batch.append(e.data)
-            uids.append(e.uid)
-        inputs, max_len = self._left_padding(batch)
-        trunc_lens = []
-        for data in batch:
-            trunc_lens.append(max_len + data['max_tokens'])
-        inputs['top_k'] = entry.data['top_k']
-        inputs['top_p'] = entry.data['top_p']
-        inputs['temperature'] = entry.data['temperature']
-        inputs['max_tokens'] = max_len + entry.data['max_tokens']
-        return TaskEntry(tuple(uids), inputs), {'trunc_lens': trunc_lens}
-    def split_batch(self, task_entry: TaskEntry, trunc_lens: List[int] = []) -> List[Tuple[Hashable, Any]]:
-        retval = []
-        for uid, output, trunc_len in zip(task_entry.uids, task_entry.batch, trunc_lens):
-            retval.append((uid, output[:trunc_len]))
-        return retval
--- a/examples/tutorial/handson5/inference/benchmark/locustfile.py
+++ b/examples/tutorial/handson5/inference/benchmark/locustfile.py
-from locust import HttpUser, task
-from json import JSONDecodeError
-class GenerationUser(HttpUser):
-    @task
-    def generate(self):
-        prompt = 'Question: What is the longest river on the earth? Answer:'
-        for i in range(4, 9):
-            data = {'max_tokens': 2**i, 'prompt': prompt}
-            with self.client.post('/generation', json=data, catch_response=True) as response:
-                if response.status_code in (200, 406):
-                    response.success()
-                else:
-                    response.failure('Response wrong')
--- a/examples/tutorial/handson5/inference/cache.py
+++ b/examples/tutorial/handson5/inference/cache.py
-from collections import OrderedDict
-from threading import Lock
-from contextlib import contextmanager
-from typing import List, Any, Hashable, Dict
-class MissCacheError(Exception):
-    pass
-class ListCache:
-    def __init__(self, cache_size: int, list_size: int, fixed_keys: List[Hashable] = []) -> None:
-        """Cache a list of values. The fixed keys won't be removed. For other keys, LRU is applied.
-        When the value list is not full, a cache miss occurs. Otherwise, a cache hit occurs. Redundant values will be removed.
-        Args:
-            cache_size (int): Max size for LRU cache.
-            list_size (int): Value list size.
-            fixed_keys (List[Hashable], optional): The keys which won't be removed. Defaults to [].
-        """
-        self.cache_size = cache_size
-        self.list_size = list_size
-        self.cache: OrderedDict[Hashable, List[Any]] = OrderedDict()
-        self.fixed_cache: Dict[Hashable, List[Any]] = {}
-        for key in fixed_keys:
-            self.fixed_cache[key] = []
-        self._lock = Lock()
-    def get(self, key: Hashable) -> List[Any]:
-        with self.lock():
-            if key in self.fixed_cache:
-                l = self.fixed_cache[key]
-                if len(l) >= self.list_size:
-                    return l
-            elif key in self.cache:
-                self.cache.move_to_end(key)
-                l = self.cache[key]
-                if len(l) >= self.list_size:
-                    return l
-        raise MissCacheError()
-    def add(self, key: Hashable, value: Any) -> None:
-        with self.lock():
-            if key in self.fixed_cache:
-                l = self.fixed_cache[key]
-                if len(l) < self.list_size and value not in l:
-                    l.append(value)
-            elif key in self.cache:
-                self.cache.move_to_end(key)
-                l = self.cache[key]
-                if len(l) < self.list_size and value not in l:
-                    l.append(value)
-            else:
-                if len(self.cache) >= self.cache_size:
-                    self.cache.popitem(last=False)
-                self.cache[key] = [value]
-    @contextmanager
-    def lock(self):
-        try:
-            self._lock.acquire()
-            yield
-        finally:
-            self._lock.release()
--- a/examples/tutorial/handson5/inference/opt_fastapi.py
+++ b/examples/tutorial/handson5/inference/opt_fastapi.py
-import argparse
-import logging
-import random
-from typing import Optional
-import uvicorn
-from energonai import QueueFullError, launch_engine
-from energonai.model import opt_6B, opt_30B, opt_125M, opt_175B
-from fastapi import FastAPI, HTTPException, Request
-from pydantic import BaseModel, Field
-from transformers import GPT2Tokenizer
-from batch import BatchManagerForGeneration
-from cache import ListCache, MissCacheError
-class GenerationTaskReq(BaseModel):
-    max_tokens: int = Field(gt=0, le=256, example=64)
-    prompt: str = Field(
-        min_length=1, example='Question: Where were the 2004 Olympics held?\nAnswer: Athens, Greece\n\nQuestion: What is the longest river on the earth?\nAnswer:')
-    top_k: Optional[int] = Field(default=None, gt=0, example=50)
-    top_p: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.5)
-    temperature: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.7)
-app = FastAPI()
-@app.post('/generation')
-async def generate(data: GenerationTaskReq, request: Request):
-    logger.info(f'{request.client.host}:{request.client.port} - "{request.method} {request.url.path}" - {data}')
-    key = (data.prompt, data.max_tokens)
-    try:
-        if cache is None:
-            raise MissCacheError()
-        outputs = cache.get(key)
-        output = random.choice(outputs)
-        logger.info('Cache hit')
-    except MissCacheError:
-        inputs = tokenizer(data.prompt, truncation=True, max_length=512)
-        inputs['max_tokens'] = data.max_tokens
-        inputs['top_k'] = data.top_k
-        inputs['top_p'] = data.top_p
-        inputs['temperature'] = data.temperature
-        try:
-            uid = id(data)
-            engine.submit(uid, inputs)
-            output = await engine.wait(uid)
-            output = tokenizer.decode(output, skip_special_tokens=True)
-            if cache is not None:
-                cache.add(key, output)
-        except QueueFullError as e:
-            raise HTTPException(status_code=406, detail=e.args[0])
-    return {'text': output}
-@app.on_event("shutdown")
-async def shutdown(*_):
-    engine.shutdown()
-    server.should_exit = True
-    server.force_exit = True
-    await server.shutdown()
-def get_model_fn(model_name: str):
-    model_map = {
-        'opt-125m': opt_125M,
-        'opt-6.7b': opt_6B,
-        'opt-30b': opt_30B,
-        'opt-175b': opt_175B
-    }
-    return model_map[model_name]
-def print_args(args: argparse.Namespace):
-    print('\n==> Args:')
-    for k, v in args.__dict__.items():
-        print(f'{k} = {v}')
-FIXED_CACHE_KEYS = [
-    ('Question: What is the name of the largest continent on earth?\nAnswer: Asia\n\nQuestion: What is at the center of the solar system?\nAnswer:', 64),
-    ('A chat between a salesman and a student.\n\nSalesman: Hi boy, are you looking for a new phone?\nStudent: Yes, my phone is not functioning well.\nSalesman: What is your budget? \nStudent: I have received my scholarship so I am fine with any phone.\nSalesman: Great, then perhaps this latest flagship phone is just right for you.', 64),
-    ("English: I am happy today.\nChinese: 我今天很开心。\n\nEnglish: I am going to play basketball.\nChinese: 我一会去打篮球。\n\nEnglish: Let's celebrate our anniversary.\nChinese:", 64)
-]
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('model', choices=['opt-125m', 'opt-6.7b', 'opt-30b', 'opt-175b'])
-    parser.add_argument('--tp', type=int, default=1)
-    parser.add_argument('--master_host', default='localhost')
-    parser.add_argument('--master_port', type=int, default=19990)
-    parser.add_argument('--rpc_port', type=int, default=19980)
-    parser.add_argument('--max_batch_size', type=int, default=8)
-    parser.add_argument('--pipe_size', type=int, default=1)
-    parser.add_argument('--queue_size', type=int, default=0)
-    parser.add_argument('--http_host', default='0.0.0.0')
-    parser.add_argument('--http_port', type=int, default=7070)
-    parser.add_argument('--checkpoint', default=None)
-    parser.add_argument('--cache_size', type=int, default=0)
-    parser.add_argument('--cache_list_size', type=int, default=1)
-    args = parser.parse_args()
-    print_args(args)
-    model_kwargs = {}
-    if args.checkpoint is not None:
-        model_kwargs['checkpoint'] = args.checkpoint
-    logger = logging.getLogger(__name__)
-    tokenizer = GPT2Tokenizer.from_pretrained('facebook/opt-30b')
-    if args.cache_size > 0:
-        cache = ListCache(args.cache_size, args.cache_list_size, fixed_keys=FIXED_CACHE_KEYS)
-    else:
-        cache = None
-    engine = launch_engine(args.tp, 1, args.master_host, args.master_port, args.rpc_port, get_model_fn(args.model),
-                           batch_manager=BatchManagerForGeneration(max_batch_size=args.max_batch_size,
-                                                                   pad_token_id=tokenizer.pad_token_id),
-                           pipe_size=args.pipe_size,
-                           queue_size=args.queue_size,
-                           **model_kwargs)
-    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
-    server = uvicorn.Server(config=config)
-    server.run()