Unverified commit 862fbaaa authored by Tong Li, committed by GitHub

[Feature] Support LLaMA-3 CPT and SFT (#5619)

* support LLaMA-3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Run pre-commit

---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent e094933d
@@ -10,10 +10,10 @@ import math
 import os
 from multiprocessing import cpu_count
-from colossal_llama2.dataset.conversation import default_conversation
-from colossal_llama2.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
+from colossal_llama.dataset.conversation import default_conversation
+from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
 from datasets import dataset_dict, load_dataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers import AddedToken, AutoTokenizer
 from colossalai.logging import get_dist_logger
@@ -32,35 +32,25 @@ def main():
     parser.add_argument(
         "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
     )
-    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
-    parser.add_argument(
-        "--data_jsonl_output_dir",
-        type=str,
-        default="jsonl_output",
-        help="Output directory of spliced dataset with jsonl format",
-    )
-    parser.add_argument(
-        "--data_arrow_output_dir",
-        type=str,
-        default="arrow_output",
-        help="Output directory of spliced dataset with arrow format",
-    )
-    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--data_output_dirs", type=str, default="data_output_dirs", help="Data output directory")
+    parser.add_argument("--max_length", type=int, default=8192, help="Max length of each spliced tokenized sequence")
     parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument("--llama_version", type=int, default=3, help="LLaMA version")
     args = parser.parse_args()
     if args.num_spliced_dataset_bins >= 100000:
         raise ValueError("Too many spliced divisions, must be smaller than 100000")
-    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
-    assert not os.path.exists(
-        args.data_jsonl_output_dir
-    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
-    assert not os.path.exists(
-        args.data_arrow_output_dir
-    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
-    os.makedirs(args.data_jsonl_output_dir)
-    os.makedirs(args.data_arrow_output_dir)
+    args.data_cache_dir = os.path.join(args.data_output_dirs, "cache")
+    args.data_jsonl_output_dir = os.path.join(args.data_output_dirs, "jsonl")
+    args.data_arrow_output_dir = os.path.join(args.data_output_dirs, "arrow")
+    if not os.path.exists(args.data_cache_dir):
+        os.makedirs(args.data_cache_dir)
+    if not os.path.exists(args.data_jsonl_output_dir):
+        os.makedirs(args.data_jsonl_output_dir)
+    if not os.path.exists(args.data_arrow_output_dir):
+        os.makedirs(args.data_arrow_output_dir)
     # Prepare to all input datasets
     input_data_paths = []
@@ -83,11 +73,20 @@ def main():
         train_splits.append(f"train[{start}%:{end}%]")
     # Prepare to the tokenizer.
-    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+    # Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
+    if args.llama_version == 2:
+        tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.unk_token
+        if tokenizer.unk_token is not None:
+            tokenizer.pad_token = tokenizer.unk_token
+        else:
+            tokenizer.pad_token = tokenizer.eos_token
+            tokenizer.unk_token = tokenizer.eos_token
     list_dataset = load_dataset(
         path="json",
...
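The tokenizer handling above is the core of the LLaMA-3 support in the dataset-preparation script: `AutoTokenizer` resolves the correct tokenizer class for either model generation, the `</s>` `AddedToken` workaround is applied only when `--llama_version 2` is passed, and a pad token is derived from `unk`/`eos` when the checkpoint defines none. A minimal standalone sketch of that logic, with the tokenizer path left as a placeholder:

```python
# Sketch of the tokenizer setup introduced in this commit; tokenizer_dir and
# llama_version stand in for the script's --tokenizer_dir / --llama_version args.
from transformers import AddedToken, AutoTokenizer


def build_tokenizer(tokenizer_dir: str, llama_version: int = 3):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    if llama_version == 2:
        # Work around </s> being split: https://github.com/huggingface/transformers/issues/23833
        tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    if tokenizer.pad_token is None:
        # LLaMA-3's tokenizer defines neither a pad nor an unk token, so fall back to eos.
        if tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
        else:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.unk_token = tokenizer.eos_token
    return tokenizer
```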
-torch<2.0.0, >=1.12.1
-packaging==23.1
-colossalai==0.3.5
+torch==2.1.2
+huggingface-hub
+packaging==24.0
+colossalai==0.3.6
 autoflake==2.2.1
 black==23.9.1
-transformers==4.33.3
+transformers==4.34.1
 tensorboard==2.14.0
 six==1.16.0
 datasets
...
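The requirements changes above pin the newer stack this commit targets (torch 2.1.2, transformers 4.34.1, colossalai 0.3.6) and list huggingface-hub explicitly. A quick environment check against those pins, offered only as a sketch and not part of the commit:

```python
# Optional sanity check that an environment matches the new pins.
import colossalai
import torch
import transformers

print("torch:", torch.__version__)                # expected 2.1.2
print("transformers:", transformers.__version__)  # expected 4.34.1
print("colossalai:", colossalai.__version__)      # expected 0.3.6
```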
 import argparse
-from colossal_llama2.utils.stream_chat_patch import streaming_chat
+from colossal_llama.utils.stream_chat_patch import streaming_chat
 from transformers import AutoModelForCausalLM, AutoTokenizer
 SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
...
@@ -12,18 +12,18 @@ from contextlib import nullcontext
 import torch
 import torch.distributed as dist
-from colossal_llama2.dataset.loader import (
+from colossal_llama.dataset.loader import (
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
 )
-from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
-from colossal_llama2.utils.froze import freeze_non_embeds_parameters
-from colossal_llama2.utils.neftune_patch import activate_neftune, deactivate_neftune
+from colossal_llama.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama.utils.froze import freeze_non_embeds_parameters
+from colossal_llama.utils.neftune_patch import activate_neftune, deactivate_neftune
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
-from transformers import LlamaForCausalLM, LlamaTokenizer
+from transformers import AutoTokenizer, LlamaForCausalLM
 import colossalai
 from colossalai.accelerator import get_accelerator
@@ -89,7 +89,7 @@ def main() -> None:
     parser.add_argument("--accumulation_steps", type=int, default=1, help="Number of accumulation steps")
     parser.add_argument("--micro_batch_size", type=int, default=2, help="Batch size of each process")
     parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate")
-    parser.add_argument("--max_length", type=int, default=4096, help="Model max length")
+    parser.add_argument("--max_length", type=int, default=8192, help="Model max length")
     parser.add_argument(
         "--mixed_precision",
         type=str,
@@ -196,7 +196,7 @@ def main() -> None:
     # ======================================================
     # Initialize Tokenizer, Dataset, Collator and Dataloader
     # ======================================================
-    tokenizer = LlamaTokenizer.from_pretrained(args.pretrained)
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
    if args.pad_token == "eos":
        tokenizer.pad_token = tokenizer.eos_token
    elif args.pad_token == "unk":
...
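Both the dataset-preparation and training scripts replace the slow, SentencePiece-based `LlamaTokenizer` with `AutoTokenizer`. This is what actually unlocks LLaMA-3: its checkpoints ship only a fast BPE `tokenizer.json` (no SentencePiece model), so `LlamaTokenizer` cannot load them, while `AutoTokenizer` resolves the right class for both LLaMA-2 and LLaMA-3. A small illustration; the checkpoint path is a placeholder for a local or downloadable model:

```python
# Illustration only: replace the path with a real LLaMA-3 checkpoint directory.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/llama-3-checkpoint")
print(type(tokenizer).__name__)  # a fast tokenizer class, not LlamaTokenizer
print(tokenizer.pad_token)       # None for LLaMA-3, hence the pad-token fallbacks above
```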
@@ -5,7 +5,7 @@ This directory contains the applications that are powered by Colossal-AI.
 The list of applications include:
 - [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
-- [X] [Colossal-LLaMA-2](./Colossal-LLaMA-2/): Continual Pre-training of LLaMA-2.
+- [X] [Colossal-LLaMA](./Colossal-LLaMA/): Continual Pre-training and Supervised Fine-tuning of LLaMA-2 / LLaMA-3.
 - [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs.
 - [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF.
 - [X] [FastFold](https://github.com/hpcaitech/FastFold): Optimizing AlphaFold (Biomedicine) Training and Inference on GPU Clusters.
...