[example] simplify opt example (#2344)

35e22be2 · Jiarui Fang · GitHub · 7080a8ed · 35e22be2 · 35e22be2
Unverified Commit 35e22be2 authored Jan 06, 2023 by Jiarui Fang Committed by GitHub Jan 06, 2023
9 changed files
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -5,7 +5,6 @@ from time import time
 import psutil
 import torch
 import torch.nn as nn
-from model_zoo import model_builder
 from packaging import version
 from torch.nn.parallel import DistributedDataParallel as DDP
 from utils import get_data, get_tflops
@@ -16,6 +15,7 @@ from colossalai.nn.parallel import ZeroDDP
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
+from model_zoo import model_builder
 CAI_VERSION = colossalai.__version__

--- a/examples/language/opt/README.md
+++ b/examples/language/opt/README.md
@@ -29,24 +29,5 @@ We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP.
 You can launch training by using the following bash script
 ```bash
-bash ./run_clm.sh <batch-size-per-gpu> <mem-cap> <model> <gpu-num>
+bash ./run_gemini.sh
 ```
- batch-size-per-gpu: number of samples fed to each GPU, default is 16
- mem-cap: limit memory usage within a value in GB, default is 0 (no limit)
- model: the size of the OPT model, default is `6.7b`. Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7`, `13b`, `30b`, `66b`. For `175b`, you can request
-the pretrained weights from [OPT weight downloading page](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT).
- gpu-num: the number of GPUs to use, default is 1.
-## Remarkable Performance
-On a single GPU, Colossal-AI’s automatic strategy provides remarkable performance gains from the ZeRO Offloading strategy by Microsoft DeepSpeed.
-Users can experience up to a 40% speedup, at a variety of model scales. However, when using a traditional deep learning training framework like PyTorch, a single GPU can no longer support the training of models at such a scale.
-<p align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT.png" width=1000/>
-</p>
-Adopting the distributed training strategy with 8 GPUs is as simple as adding a `-nprocs 8` to the training command of Colossal-AI!
-More details about behind the scenes can be found on the corresponding [blog](https://medium.com/@yangyou_berkeley/colossal-ai-seamlessly-accelerates-large-models-at-low-costs-with-hugging-face-4d1a887e500d),
-and a detailed tutorial will be added in [Documentation](https://www.colossalai.org/docs/get_started/installation) very soon.
--- a/examples/language/opt/benchmark.sh
+++ b/examples/language/opt/benchmark.sh
@@ -14,7 +14,7 @@ do
 pkill -9 torchrun
 pkill -9 python
-bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM
+# The launch script takes its memory cap from the MEMCAP variable (`${MEMCAP:-0}`); MEM_CAP is not read by it.
+env BS=$BS MEMCAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh
 done
 done
 done

--- a/examples/language/opt/colossalai_zero.py
+++ b/examples/language/opt/colossalai_zero.py
-from colossalai.zero.shard_utils import TensorShardStrategy
-zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
-                              tensor_placement_policy="auto",
-                              reuse_fp16_shard=True),
-            optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384))
--- a/examples/language/opt/context.py
+++ b/examples/language/opt/context.py
-import torch.distributed as dist
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-class barrier_context():
-    """
-    This context manager is used to allow one process to execute while blocking all
-    other processes in the same process group. This is often useful when downloading is required
-    as we only want to download in one process to prevent file corruption.
-    Args:
-        executor_rank (int): the process rank to execute without blocking, all other processes will be blocked
-        parallel_mode (ParallelMode): the parallel mode corresponding to a process group
-    Usage:
-        with barrier_context():
-            dataset = CIFAR10(root='./data', download=True)
-    """
-    def __init__(self, executor_rank: int = 0, parallel_mode: ParallelMode = ParallelMode.GLOBAL):
-        # the class name is lowercase by convention
-        current_rank = gpc.get_local_rank(parallel_mode=parallel_mode)
-        self.should_block = current_rank != executor_rank
-        self.group = gpc.get_group(parallel_mode=parallel_mode)
-    def __enter__(self):
-        if self.should_block:
-            dist.barrier(group=self.group)
-    def __exit__(self, exc_type, exc_value, exc_traceback):
-        if not self.should_block:
-            dist.barrier(group=self.group)
--- a/examples/language/opt/requirements.txt
+++ b/examples/language/opt/requirements.txt
-colossalai
-torch >= 1.8.1
-datasets >= 1.8.0
-sentencepiece != 0.1.92
-protobuf
-accelerate == 0.13.2
--- a/examples/language/opt/run_clm.py
+++ b/examples/language/opt/run_clm.py
--- a/examples/language/opt/run_clm.sh
+++ b/examples/language/opt/run_clm.sh
 set -x
-export BS=${1:-16}
+export BS=${BS:-16}
-export MEMCAP=${2:-0}
+export MEMCAP=${MEMCAP:-0}
-export MODEL=${3:-"125m"}
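+# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`. For `175b`, request the pretrained weights from the OPT weight downloading page: https://github.com/facebookresearch/metaseq/tree/main/projects/OPT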
-export GPUNUM=${4:-1}
+export MODEL=${MODEL:-"125m"}
+export GPUNUM=${GPUNUM:-1}
 # make directory for logs
 mkdir -p ./logs
@@ -13,10 +14,7 @@ export MODLE_PATH="facebook/opt-${MODEL}"
 torchrun \
  --nproc_per_node ${GPUNUM} \
  --master_port 19198 \
-  run_clm.py \
+  train_gemini_opt.py \
-  --dataset_name wikitext \
-  --dataset_config_name wikitext-2-raw-v1 \
-  --output_dir $PWD \
  --mem_cap ${MEMCAP} \
  --model_name_or_path ${MODLE_PATH} \
-  --per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
+  --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
--- a/examples/language/opt/train_gemini_opt.py
+++ b/examples/language/opt/train_gemini_opt.py
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
+on a text file or a dataset without using HuggingFace Trainer.
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=text-generation
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+import time
+from functools import partial
+import datasets
+import torch
+import torch.distributed as dist
+import transformers
+from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM
+from transformers.utils.versions import require_version
+import colossalai
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
+from colossalai.nn.parallel import GeminiDDP
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+def get_data(batch_size, seq_len, vocab_size):
+    """Generate one synthetic batch on the current CUDA device.
+
+    Returns:
+        A tuple ``(input_ids, attention_mask)``: ``input_ids`` holds random
+        integers in ``[0, vocab_size)`` with shape ``(batch_size, seq_len)``,
+        and ``attention_mask`` is a tensor of ones shaped like ``input_ids``.
+    """
+    device = torch.cuda.current_device()
+    batch_shape = (batch_size, seq_len)
+    token_ids = torch.randint(0, vocab_size, batch_shape, device=device)
+    return token_ids, torch.ones_like(token_ids)
+# Check the installed `datasets` package against ">=1.8.0"; the second argument is the hint text handed to require_version.
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
+# Config classes that are keys of transformers' MODEL_MAPPING.
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+# Their `model_type` values; used as the allowed `choices` of the --model_type argument in parse_args().
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+def get_time_stamp():
+    """Return ``time.time()`` read after ``torch.cuda.synchronize()`` has returned."""
+    torch.cuda.synchronize()
+    timestamp = time.time()
+    return timestamp
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    """Estimate the throughput of one training step in TFLOPS.
+
+    Args:
+        model_numel: number of model parameters.
+        batch_size: samples processed in the step.
+        seq_len: tokens per sample.
+        step_time: duration of the step in seconds.
+
+    Returns:
+        ``model_numel * batch_size * seq_len * 8`` scaled by 1e12 and divided by
+        the step time.
+    """
+    # NOTE(review): the factor 8 presumably counts forward + backward + recomputation FLOPs
+    # per parameter per token - confirm against the benchmark methodology.
+    flop_count = model_numel * batch_size * seq_len * 8
+    # the tiny epsilon keeps the division defined when step_time is 0
+    elapsed = step_time + 1e-12
+    return flop_count / 1e12 / elapsed
+def parse_args():
+    """Register this script's command-line options and parse them.
+
+    The parser comes from ``colossalai.get_default_parser()``; the options below
+    are added to it in the listed order.
+
+    Returns:
+        The result of ``parser.parse_args()``.
+    """
+    arg_parser = colossalai.get_default_parser()
+    # (flag, keyword arguments for add_argument), kept in registration order
+    option_specs = (
+        ("--model_name_or_path",
+         dict(type=str,
+              help="Path to pretrained model or model identifier from huggingface.co/models.",
+              required=True)),
+        ("--config_name",
+         dict(type=str, default=None, help="Pretrained config name or path if not the same as model_name")),
+        ("--batch_size",
+         dict(type=int, default=8, help="Batch size (per dp group) for the training dataloader.")),
+        ("--learning_rate",
+         dict(type=float,
+              default=5e-5,
+              help="Initial learning rate (after the potential warmup period) to use.")),
+        ("--weight_decay", dict(type=float, default=0.0, help="Weight decay to use.")),
+        ("--max_train_steps",
+         dict(type=int, default=20, help="Total number of training steps to perform.")),
+        ("--seed", dict(type=int, default=None, help="A seed for reproducible training.")),
+        ("--model_type",
+         dict(type=str, default=None, help="Model type to use if training from scratch.", choices=MODEL_TYPES)),
+        ("--mem_cap", dict(type=int, default=0, help="use mem cap")),
+        ("--init_in_cpu", dict(action='store_true', default=False, help="init training model in cpu")),
+    )
+    for flag, options in option_specs:
+        arg_parser.add_argument(flag, **options)
+    return arg_parser.parse_args()
+def colo_memory_cap(size_in_GB):
+    """Apply a GPU memory cap of ``size_in_GB`` gigabytes if it is below the device capacity.
+
+    When ``size_in_GB * 1024**3`` is smaller than the value returned by
+    ``colo_device_memory_capacity`` for the current device, the ratio of the two
+    is passed to ``colo_set_process_memory_fraction`` and a message is printed.
+    Otherwise nothing happens.
+
+    Args:
+        size_in_GB: requested cap, in units of ``1024**3`` bytes.
+    """
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    device_capacity = colo_device_memory_capacity(get_current_device())
+    # presumably the capacity is reported in bytes, since it is compared with a byte count - verify
+    requested_bytes = size_in_GB * (1024**3)
+    if requested_bytes < device_capacity:
+        colo_set_process_memory_fraction(requested_bytes / device_capacity)
+        print("Using {} GB of GPU memory".format(size_in_GB))
+def main():
+    """Run a short OPT causal-LM training benchmark with Gemini on random token batches.
+
+    Steps: parse the command-line arguments, launch ColossalAI from the torchrun
+    environment, build an ``OPTForCausalLM`` model inside ``ColoInitContext``
+    (fp16), wrap it with ``GeminiDDP`` and ``GeminiAdamOptimizer``, then run
+    ``args.max_train_steps`` steps on batches from ``get_data`` and log the
+    per-step TFLOPS on rank 0.
+    """
+    args = parse_args()
+    disable_existing_loggers()
+    colossalai.launch_from_torch({})
+    logger = get_dist_logger()
+    # only rank 0 keeps informative third-party logging; other ranks report errors only
+    is_main_process = dist.get_rank() == 0
+    if is_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    if args.mem_cap > 0:
+        colo_memory_cap(args.mem_cap)
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        # `torch.manual_seed` is the correct API name; the previous `torch.mannul_seed`
+        # raised AttributeError whenever --seed was supplied.
+        torch.manual_seed(args.seed)
+        logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}")
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # Load pretrained model
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    logger.info("Model config has been created", ranks=[0])
+    if args.init_in_cpu:
+        init_dev = torch.device('cpu')
+    else:
+        init_dev = get_current_device()
+    # build model
+    if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b':
+        # currently there is a bug in the pretrained opt-13b checkpoint,
+        # so it cannot be loaded until HuggingFace fixes it
+        logger.info("Train a new model from scratch", ranks=[0])
+        with ColoInitContext(device=init_dev, dtype=torch.half):
+            model = OPTForCausalLM(config)
+    else:
+        logger.info("Finetune a pre-trained model", ranks=[0])
+        with ColoInitContext(device=init_dev, dtype=torch.half):
+            model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
+                                                   from_tf=bool(".ckpt" in args.model_name_or_path),
+                                                   config=config,
+                                                   local_files_only=False)
+    # enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+    # parameter count, taken before the model is wrapped; used for the TFLOPS estimate
+    numel = sum([p.numel() for p in model.parameters()])
+    PLACEMENT_POLICY = 'cpu'
+    model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
+    optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)
+    # fixed shape of the synthetic batches produced by get_data
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+    get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
+    model.train()
+    for step in range(args.max_train_steps):
+        st_time = time.time()
+        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
+        # the random input ids double as labels
+        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
+        loss = outputs['loss']
+        optimizer.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+        # synchronize before reading the clock so the step time is taken after queued GPU work completes
+        torch.cuda.synchronize()
+        step_time = time.time() - st_time
+        step_tflops = get_tflops_func(step_time)
+        logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0])
+    logger.info("Training finished", ranks=[0])
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()