Unverified Commit bb6196e7 authored by Fazzie-Maqianli, committed by GitHub

remove chatgpt (#3284)

parent b0ce5a10
set_n_least_used_CUDA_VISIBLE_DEVICES 1
python train_reward_model.py --pretrain 'microsoft/deberta-v3-large' \
    --model 'deberta' \
    --strategy naive \
    --loss_fn 'log_exp' \
    --save_path 'rmstatic.pt' \
    --test True
import argparse
import loralib as lora
import torch
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from chatgpt.dataset import SFTDataset, AlpacaDataset, AlpacaDataCollator
from chatgpt.models.base import RewardModel
from chatgpt.models.bloom import BLOOMLM
from chatgpt.models.gpt import GPTLM
from chatgpt.models.opt import OPTLM
from chatgpt.models.llama import LlamaLM
from chatgpt.trainer import SFTTrainer
from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from chatgpt.utils import prepare_llama_tokenizer_and_embedding
from datasets import load_dataset
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BloomTokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
from colossalai.logging import get_dist_logger
def train(args):
    # configure strategy
    if args.strategy == 'naive':
        strategy = NaiveStrategy()
    elif args.strategy == 'ddp':
        strategy = DDPStrategy()
    elif args.strategy == 'colossalai_gemini':
        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
    elif args.strategy == 'colossalai_zero2':
        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
    else:
        raise ValueError(f'Unsupported strategy "{args.strategy}"')
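    # note: 'colossalai_gemini' is ZeRO stage 3 managed by ColossalAI's Gemini
    # memory manager, while 'colossalai_zero2' shards gradients and optimizer
    # states (ZeRO stage 2); both keep tensors on CUDA here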
    # configure model
    with strategy.model_init_context():
        if args.model == 'bloom':
            model = BLOOMLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
        elif args.model == 'opt':
            model = OPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
        elif args.model == 'gpt2':
            model = GPTLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
        elif args.model == 'llama':
            model = LlamaLM(pretrained=args.pretrain, lora_rank=args.lora_rank).cuda()
        else:
            raise ValueError(f'Unsupported model "{args.model}"')
    # configure tokenizer
    if args.model == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.eos_token
    elif args.model == 'bloom':
        tokenizer = BloomTokenizerFast.from_pretrained(args.pretrain)
        tokenizer.pad_token = tokenizer.eos_token
    elif args.model == 'opt':
        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
    elif args.model == 'llama':
        tokenizer = AutoTokenizer.from_pretrained(
            args.pretrain,
            padding_side="right",
            use_fast=False,
        )
    else:
        raise ValueError(f'Unsupported model "{args.model}"')
    if args.model == 'llama':
        tokenizer = prepare_llama_tokenizer_and_embedding(tokenizer, model)
    else:
        tokenizer.pad_token = tokenizer.eos_token
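    # LLaMA checkpoints ship without a pad token, so the helper above is
    # expected to add one and resize the input embeddings to match; the other
    # models simply reuse their EOS token for padding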
    max_len = 512
    # configure optimizer
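    # HybridAdam is ColossalAI's fused Adam variant that can update parameters
    # living on either CPU or GPU, which the Gemini/ZeRO strategies rely on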
    if args.strategy.startswith('colossalai'):
        optim = HybridAdam(model.parameters(), lr=5e-5)
    else:
        optim = Adam(model.parameters(), lr=5e-5)
    logger = get_dist_logger()
    # configure dataset
    if args.dataset == 'yizhongw/self_instruct':
        train_data = load_dataset(args.dataset, 'super_natural_instructions', split='train')
        eval_data = load_dataset(args.dataset, 'super_natural_instructions', split='test')
        train_dataset = SFTDataset(train_data, tokenizer, max_len)
        eval_dataset = SFTDataset(eval_data, tokenizer, max_len)
        # no custom collator here: fall back to the DataLoader default
        # (assumes SFTDataset yields fixed-length tensors)
        data_collator = None
    elif 'alpaca' in args.dataset:
        train_dataset = AlpacaDataset(tokenizer=tokenizer, data_path=args.dataset)
        eval_dataset = None
        data_collator = AlpacaDataCollator(tokenizer=tokenizer)
    else:
        raise ValueError(f'Unsupported dataset "{args.dataset}"')
    if dist.is_initialized() and dist.get_world_size() > 1:
        train_sampler = DistributedSampler(train_dataset, shuffle=True, seed=42, drop_last=True)
        eval_sampler = None
        if eval_dataset is not None:
            eval_sampler = DistributedSampler(eval_dataset, shuffle=False, seed=42, drop_last=False)
    else:
        train_sampler = None
        eval_sampler = None
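    # a DataLoader must not shuffle when a sampler is given, so shuffling is
    # enabled only in the single-process case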
    train_dataloader = DataLoader(train_dataset, shuffle=(train_sampler is None), sampler=train_sampler, batch_size=args.batch_size, collate_fn=data_collator)
    if eval_dataset is not None:
        eval_dataloader = DataLoader(eval_dataset, shuffle=(eval_sampler is None), sampler=eval_sampler, batch_size=args.batch_size, collate_fn=data_collator)
    else:
        eval_dataloader = None
    trainer = SFTTrainer(model=model,
                         strategy=strategy,
                         optim=optim,
                         train_dataloader=train_dataloader,
                         eval_dataloader=eval_dataloader,
                         batch_size=args.batch_size,
                         max_epochs=args.max_epochs)
    trainer.fit(logger=logger, use_lora=args.lora_rank, log_interval=args.log_interval)
    # save model checkpoint after fitting on only rank0
    strategy.save_model(model, args.save_path, only_rank0=True)
    # save optimizer checkpoint on all ranks
    strategy.save_optimizer(optim, 'sft_optim_checkpoint_%d.pt' % (torch.cuda.current_device()), only_rank0=False)
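    # (the model file is written once by rank 0, while optimizer state is
    # written per rank because the ZeRO strategies shard it across devices)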
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--strategy',
                        choices=['naive', 'ddp', 'colossalai_gemini', 'colossalai_zero2'],
                        default='naive')
    parser.add_argument('--model', choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom')
    parser.add_argument('--pretrain', type=str, default=None)
    parser.add_argument('--dataset', type=str, default='yizhongw/self_instruct')
    parser.add_argument('--save_path', type=str, default='sft_ckpt.pth')
    parser.add_argument('--max_epochs', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--lora_rank', type=int, default=0, help="low-rank adaptation matrices rank")
    parser.add_argument('--log_interval', type=int, default=100, help="how many steps to log")
    args = parser.parse_args()
    train(args)
set_n_least_used_CUDA_VISIBLE_DEVICES() {
    local n=${1:-"9999"}
    echo "GPU Memory Usage:"
    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
        | tail -n +2 \
        | nl -v 0 \
        | tee /dev/tty \
        | sort -g -k 2 \
        | awk '{print $1}' \
        | head -n $n)
    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
    echo "Now CUDA_VISIBLE_DEVICES is set to:"
    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 8
#torchrun --standalone --nproc_per_node=2 train_sft.py --pretrain 'bigscience/bloomz-560m' --model 'bloom' --strategy colossalai_zero2 --log_interval 10
#torchrun --standalone --nproc_per_node=8 train_sft.py --model 'gpt2' --strategy colossalai_zero2 --batch_size 1 --log_interval 10
torchrun --standalone --nproc_per_node=8 train_sft.py \
    --pretrain "/data/personal/nus-mql/LLAMA-7B" \
    --model 'llama' \
    --strategy colossalai_zero2 \
    --log_interval 10 \
    --save_path /data/personal/nus-mql/Coati-7B \
    --dataset /data/personal/nus-mql/stanford_alpaca/alpaca_data.json
[pytest]
markers =
    cpu: tests which can run on CPU
    gpu: tests which require a single GPU
    dist: tests which are run in a multi-GPU or multi-machine environment
    experiment: tests for experimental features
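For reference, these markers are selected with pytest's standard -m flag; a minimal sketch, assuming the suite lives in the tests directory referenced by setup.py below:

# run only the CPU-safe subset
pytest -m cpu tests
# run the multi-GPU tests (needs at least two CUDA devices)
pytest -m dist tests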
transformers>=4.20.1
tqdm
datasets
loralib
colossalai>=0.2.4
torch==1.12.1
langchain
from setuptools import find_packages, setup


def fetch_requirements(path):
    with open(path, 'r') as fd:
        return [r.strip() for r in fd.readlines()]


def fetch_readme():
    with open('README.md', encoding='utf-8') as f:
        return f.read()


def fetch_version():
    with open('version.txt', 'r') as f:
        return f.read().strip()


setup(
    name='chatgpt',
    version=fetch_version(),
    packages=find_packages(exclude=(
        'tests',
        'benchmarks',
        '*.egg-info',
    )),
    description='An RLHF implementation (ChatGPT) powered by ColossalAI',
    long_description=fetch_readme(),
    long_description_content_type='text/markdown',
    license='Apache Software License 2.0',
    url='https://github.com/hpcaitech/ChatGPT',
    install_requires=fetch_requirements('requirements.txt'),
    python_requires='>=3.6',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: Apache Software License',
        'Environment :: GPU :: NVIDIA CUDA',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: System :: Distributed Computing',
    ],
)
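Given this setup.py, a typical development workflow (an illustration, not prescribed by the repository) is an editable install from the repository root:

# install runtime dependencies, then the chatgpt package itself in editable mode
pip install -r requirements.txt
pip install -e .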
import os
import tempfile
from contextlib import nullcontext
from functools import partial
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from chatgpt.models.gpt import GPTActor
from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from colossalai.nn.optimizer import HybridAdam
from colossalai.testing import rerun_if_address_is_in_use
from colossalai.utils import free_port
GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
def get_data(batch_size: int, seq_len: int = 10) -> dict:
    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
    attention_mask = torch.ones_like(input_ids)
    return dict(input_ids=input_ids, attention_mask=attention_mask)
def run_test_checkpoint(strategy):
    BATCH_SIZE = 2

    if strategy == 'ddp':
        strategy = DDPStrategy()
    elif strategy == 'colossalai_gemini':
        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
    elif strategy == 'colossalai_zero2':
        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
    else:
        raise ValueError(f'Unsupported strategy "{strategy}"')

    with strategy.model_init_context():
        actor = GPTActor(config=GPT_CONFIG).cuda()

    actor_optim = HybridAdam(actor.parameters())
    actor, actor_optim = strategy.prepare((actor, actor_optim))

    def run_step():
        data = get_data(BATCH_SIZE)
        action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)
        action_log_probs = actor(data['input_ids'], action_mask.size(1), data['attention_mask'])
        loss = action_log_probs.sum()
        strategy.backward(loss, actor, actor_optim)
        strategy.optimizer_step(actor_optim)

    run_step()
    ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()
    with ctx as dirname:
        # rank 0 owns the temporary directory; broadcast its path to all ranks
        rank0_dirname = [dirname]
        dist.broadcast_object_list(rank0_dirname)
        rank0_dirname = rank0_dirname[0]

        model_path = os.path.join(rank0_dirname, 'model.pt')
        optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
        strategy.save_model(actor, model_path, only_rank0=True)
        strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
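        # make sure rank 0 has finished writing before any rank loads the files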
        dist.barrier()

        strategy.load_model(actor, model_path, strict=False)
        strategy.load_optimizer(actor_optim, optim_path)
        dist.barrier()
        run_step()
def run_dist(rank, world_size, port, strategy):
    os.environ['RANK'] = str(rank)
    os.environ['LOCAL_RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    run_test_checkpoint(strategy)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])
@rerun_if_address_is_in_use()
def test_checkpoint(world_size, strategy):
    run_func = partial(run_dist, world_size=world_size, port=free_port(), strategy=strategy)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_checkpoint(2, 'colossalai_zero2')
import os
from copy import deepcopy
from functools import partial
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from chatgpt.experience_maker import NaiveExperienceMaker
from chatgpt.models.base import RewardModel
from chatgpt.models.gpt import GPTActor, GPTCritic
from chatgpt.replay_buffer import NaiveReplayBuffer
from chatgpt.trainer.strategies import ColossalAIStrategy, DDPStrategy
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from colossalai.testing import rerun_if_address_is_in_use
from colossalai.utils import free_port
GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
def get_data(batch_size: int, seq_len: int = 10) -> dict:
    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
    attention_mask = torch.ones_like(input_ids)
    return dict(input_ids=input_ids, attention_mask=attention_mask)
def gather_and_equal(tensor: torch.Tensor) -> bool:
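    # all-gather the tensor from every rank and report whether all copies match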
    world_size = dist.get_world_size()
    outputs = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(outputs, tensor.contiguous())
    for t in outputs[1:]:
        if not torch.equal(outputs[0], t):
            return False
    return True
def run_test_data(strategy):
    EXPERIENCE_BATCH_SIZE = 4
    SAMPLE_BATCH_SIZE = 2

    if strategy == 'ddp':
        strategy = DDPStrategy()
    elif strategy == 'colossalai':
        strategy = ColossalAIStrategy(placement_policy='cuda')
    else:
        raise ValueError(f'Unsupported strategy "{strategy}"')

    actor = GPTActor(config=GPT_CONFIG).cuda()
    critic = GPTCritic(config=GPT_CONFIG).cuda()

    initial_model = deepcopy(actor)
    reward_model = RewardModel(deepcopy(critic.model)).cuda()

    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model)
    replay_buffer = NaiveReplayBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)

    # experience of all ranks should be the same
    for _ in range(2):
        data = get_data(EXPERIENCE_BATCH_SIZE)
        assert gather_and_equal(data['input_ids'])
        assert gather_and_equal(data['attention_mask'])
        experience = experience_maker.make_experience(**data,
                                                      do_sample=True,
                                                      max_length=16,
                                                      eos_token_id=50256,
                                                      pad_token_id=50256)
        assert gather_and_equal(experience.sequences)
        assert gather_and_equal(experience.action_log_probs)
        assert gather_and_equal(experience.values)
        assert gather_and_equal(experience.reward)
        assert gather_and_equal(experience.advantages)
        assert gather_and_equal(experience.action_mask)
        assert gather_and_equal(experience.attention_mask)
        replay_buffer.append(experience)

    # replay buffer's data should be the same
    buffer_size = torch.tensor([len(replay_buffer)], device='cuda')
    assert gather_and_equal(buffer_size)
    for item in replay_buffer.items:
        assert gather_and_equal(item.sequences)
        assert gather_and_equal(item.action_log_probs)
        assert gather_and_equal(item.values)
        assert gather_and_equal(item.reward)
        assert gather_and_equal(item.advantages)
        assert gather_and_equal(item.action_mask)
        assert gather_and_equal(item.attention_mask)

    # dataloader of each rank should have the same size and different batch
    dataloader = strategy.setup_dataloader(replay_buffer)
    dataloader_size = torch.tensor([len(dataloader)], device='cuda')
    assert gather_and_equal(dataloader_size)
    for experience in dataloader:
        assert not gather_and_equal(experience.sequences)
        assert not gather_and_equal(experience.action_log_probs)
        assert not gather_and_equal(experience.values)
        assert not gather_and_equal(experience.reward)
        assert not gather_and_equal(experience.advantages)
        # action mask and attention mask may be same
def run_dist(rank, world_size, port, strategy):
    os.environ['RANK'] = str(rank)
    os.environ['LOCAL_RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    run_test_data(strategy)
@pytest.mark.skip
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@pytest.mark.parametrize('strategy', ['ddp', 'colossalai'])
@rerun_if_address_is_in_use()
def test_data(world_size, strategy):
    run_func = partial(run_dist, world_size=world_size, port=free_port(), strategy=strategy)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_data(2, 'colossalai')