"git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "d2cd48e0bec01a4192a5248ea312e108544066b9"
Unverified Commit 35e22be2 authored by Jiarui Fang's avatar Jiarui Fang Committed by GitHub
Browse files

[example] simplify opt example (#2344)

parent 7080a8ed
...@@ -5,7 +5,6 @@ from time import time ...@@ -5,7 +5,6 @@ from time import time
import psutil import psutil
import torch import torch
import torch.nn as nn import torch.nn as nn
from model_zoo import model_builder
from packaging import version from packaging import version
from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP
from utils import get_data, get_tflops from utils import get_data, get_tflops
...@@ -16,6 +15,7 @@ from colossalai.nn.parallel import ZeroDDP ...@@ -16,6 +15,7 @@ from colossalai.nn.parallel import ZeroDDP
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext from colossalai.utils.model.colo_init_context import ColoInitContext
from model_zoo import model_builder
CAI_VERSION = colossalai.__version__ CAI_VERSION = colossalai.__version__
......
...@@ -29,24 +29,5 @@ We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP. ...@@ -29,24 +29,5 @@ We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP.
You can launch training by using the following bash script You can launch training by using the following bash script
```bash ```bash
bash ./run_clm.sh <batch-size-per-gpu> <mem-cap> <model> <gpu-num> bash ./run_gemini.sh
``` ```
- batch-size-per-gpu: number of samples fed to each GPU, default is 16
- mem-cap: limit memory usage within a value in GB, default is 0 (no limit)
- model: the size of the OPT model, default is `6.7b`. Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7`, `13b`, `30b`, `66b`. For `175b`, you can request
the pretrained weights from [OPT weight downloading page](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT).
- gpu-num: the number of GPUs to use, default is 1.
## Remarkable Performance
On a single GPU, Colossal-AI’s automatic strategy provides remarkable performance gains from the ZeRO Offloading strategy by Microsoft DeepSpeed.
Users can experience up to a 40% speedup, at a variety of model scales. However, when using a traditional deep learning training framework like PyTorch, a single GPU can no longer support the training of models at such a scale.
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/OPT.png" width=1000/>
</p>
Adopting the distributed training strategy with 8 GPUs is as simple as adding a `-nprocs 8` to the training command of Colossal-AI!
More details about behind the scenes can be found on the corresponding [blog](https://medium.com/@yangyou_berkeley/colossal-ai-seamlessly-accelerates-large-models-at-low-costs-with-hugging-face-4d1a887e500d),
and a detailed tutorial will be added in [Documentation](https://www.colossalai.org/docs/get_started/installation) very soon.
...@@ -14,7 +14,7 @@ do ...@@ -14,7 +14,7 @@ do
pkill -9 torchrun pkill -9 torchrun
pkill -9 python pkill -9 python
bash ./run_clm.sh $BS $MEMCAP $MODEL $GPUNUM env BS=$BS MEM_CAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh
done done
done done
done done
......
from colossalai.zero.shard_utils import TensorShardStrategy
zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(),
tensor_placement_policy="auto",
reuse_fp16_shard=True),
optimizer_config=dict(gpu_margin_mem_ratio=0.8, initial_scale=16384))
import torch.distributed as dist
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
class barrier_context():
"""
This context manager is used to allow one process to execute while blocking all
other processes in the same process group. This is often useful when downloading is required
as we only want to download in one process to prevent file corruption.
Args:
executor_rank (int): the process rank to execute without blocking, all other processes will be blocked
parallel_mode (ParallelMode): the parallel mode corresponding to a process group
Usage:
with barrier_context():
dataset = CIFAR10(root='./data', download=True)
"""
def __init__(self, executor_rank: int = 0, parallel_mode: ParallelMode = ParallelMode.GLOBAL):
# the class name is lowercase by convention
current_rank = gpc.get_local_rank(parallel_mode=parallel_mode)
self.should_block = current_rank != executor_rank
self.group = gpc.get_group(parallel_mode=parallel_mode)
def __enter__(self):
if self.should_block:
dist.barrier(group=self.group)
def __exit__(self, exc_type, exc_value, exc_traceback):
if not self.should_block:
dist.barrier(group=self.group)
colossalai
torch >= 1.8.1
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
accelerate == 0.13.2
This diff is collapsed.
set -x set -x
export BS=${1:-16} export BS=${BS:-16}
export MEMCAP=${2:-0} export MEMCAP=${MEMCAP:-0}
export MODEL=${3:-"125m"} # Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7`, `13b`, `30b`, `66b`. For `175b`
export GPUNUM=${4:-1} export MODEL=${MODEL:-"125m"}
export GPUNUM=${GPUNUM:-1}
# make directory for logs # make directory for logs
mkdir -p ./logs mkdir -p ./logs
...@@ -13,10 +14,7 @@ export MODLE_PATH="facebook/opt-${MODEL}" ...@@ -13,10 +14,7 @@ export MODLE_PATH="facebook/opt-${MODEL}"
torchrun \ torchrun \
--nproc_per_node ${GPUNUM} \ --nproc_per_node ${GPUNUM} \
--master_port 19198 \ --master_port 19198 \
run_clm.py \ train_gemini_opt.py \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--output_dir $PWD \
--mem_cap ${MEMCAP} \ --mem_cap ${MEMCAP} \
--model_name_or_path ${MODLE_PATH} \ --model_name_or_path ${MODLE_PATH} \
--per_device_train_batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
on a text file or a dataset without using HuggingFace Trainer.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import time
from functools import partial
import datasets
import torch
import torch.distributed as dist
import transformers
from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM
from transformers.utils.versions import require_version
import colossalai
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
def get_data(batch_size, seq_len, vocab_size):
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
attention_mask = torch.ones_like(input_ids)
return input_ids, attention_mask
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def get_time_stamp():
torch.cuda.synchronize()
return time.time()
def get_tflops(model_numel, batch_size, seq_len, step_time):
return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
def parse_args():
parser = colossalai.get_default_parser()
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--config_name",
type=str,
default=None,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--batch_size",
type=int,
default=8,
help="Batch size (per dp group) for the training dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument(
"--max_train_steps",
type=int,
default=20,
help="Total number of training steps to perform.",
)
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--model_type",
type=str,
default=None,
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap")
parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu")
args = parser.parse_args()
return args
def colo_memory_cap(size_in_GB):
from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
cuda_capacity = colo_device_memory_capacity(get_current_device())
if size_in_GB * (1024**3) < cuda_capacity:
colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
print("Using {} GB of GPU memory".format(size_in_GB))
def main():
args = parse_args()
disable_existing_loggers()
colossalai.launch_from_torch({})
logger = get_dist_logger()
is_main_process = dist.get_rank() == 0
if is_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
if args.mem_cap > 0:
colo_memory_cap(args.mem_cap)
# If passed along, set the training seed now.
if args.seed is not None:
torch.mannul_seed(args.seed)
logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}")
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# Load pretrained model
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
config = CONFIG_MAPPING[args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
logger.info("Model config has been created", ranks=[0])
if args.init_in_cpu:
init_dev = torch.device('cpu')
else:
init_dev = get_current_device()
# build model
if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b':
# currently, there has a bug in pretrained opt-13b
# we can not import it until huggingface fix it
logger.info("Train a new model from scratch", ranks=[0])
with ColoInitContext(device=init_dev, dtype=torch.half):
model = OPTForCausalLM(config)
else:
logger.info("Finetune a pre-trained model", ranks=[0])
with ColoInitContext(device=init_dev, dtype=torch.half):
model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
local_files_only=False)
# enable graident checkpointing
model.gradient_checkpointing_enable()
numel = sum([p.numel() for p in model.parameters()])
PLACEMENT_POLICY = 'cpu'
model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)
SEQ_LEN = 1024
VOCAB_SIZE = 50257
get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
model.train()
for step in range(args.max_train_steps):
st_time = time.time()
input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
loss = outputs['loss']
optimizer.backward(loss)
optimizer.step()
optimizer.zero_grad()
torch.cuda.synchronize()
step_time = time.time() - st_time
step_tflops = get_tflops_func(step_time)
logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0])
logger.info("Training finished", ranks=[0])
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment